initial commit
This commit is contained in:
commit
3d842dfaf2
|
@ -0,0 +1,10 @@
|
|||
#*
|
||||
*.sublime-*
|
||||
*~
|
||||
.#*
|
||||
.project
|
||||
.settings
|
||||
.DS_Store
|
||||
/examples/bleve_index_json/bleve_index_json
|
||||
/examples/bleve_query/bleve_query
|
||||
/utils/bleve_dump/bleve_dump
|
|
@ -0,0 +1,24 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package keyword_analyzer
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
"github.com/couchbaselabs/bleve/analysis/tokenizers/single_token"
|
||||
)
|
||||
|
||||
func NewKeywordAnalyzer() (*analysis.Analyzer, error) {
|
||||
keyword := analysis.Analyzer{
|
||||
CharFilters: []analysis.CharFilter{},
|
||||
Tokenizer: single_token.NewSingleTokenTokenizer(),
|
||||
Filters: []analysis.TokenFilter{},
|
||||
}
|
||||
|
||||
return &keyword, nil
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package standard_analyzer
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_words_filter"
|
||||
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
|
||||
)
|
||||
|
||||
func NewStandardAnalyzer() (*analysis.Analyzer, error) {
|
||||
lower_case_filter, err := lower_case_filter.NewLowerCaseFilter()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
stop_words_filter, err := stop_words_filter.NewStopWordsFilter()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
standard := analysis.Analyzer{
|
||||
CharFilters: []analysis.CharFilter{},
|
||||
Tokenizer: unicode_word_boundary.NewUnicodeWordBoundaryTokenizer(),
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
lower_case_filter,
|
||||
stop_words_filter,
|
||||
},
|
||||
}
|
||||
|
||||
return &standard, nil
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package html_char_filter
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
|
||||
)
|
||||
|
||||
// the origin of this regex is here:
|
||||
// http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/
|
||||
// slightly modified by me to also match the DOCTYPE
|
||||
const htmlTagPattern = `</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`
|
||||
|
||||
var htmlRegex = regexp.MustCompile(htmlTagPattern)
|
||||
|
||||
type HtmlCharFilter struct {
|
||||
*regexp_char_filter.RegexpCharFilter
|
||||
}
|
||||
|
||||
func NewHtmlCharFilter() *HtmlCharFilter {
|
||||
return &HtmlCharFilter{
|
||||
regexp_char_filter.NewRegexpCharFilter(htmlRegex, []byte{' '}),
|
||||
}
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package html_char_filter
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestHtmlCharFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output []byte
|
||||
}{
|
||||
{
|
||||
input: []byte(`<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
|
||||
<h1>My First Heading</h1>
|
||||
|
||||
<p>My first paragraph.</p>
|
||||
|
||||
</body>
|
||||
</html>`),
|
||||
output: []byte(`
|
||||
|
||||
|
||||
|
||||
My First Heading
|
||||
|
||||
My first paragraph.
|
||||
|
||||
|
||||
`),
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
filter := NewHtmlCharFilter()
|
||||
output := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(output, test.output) {
|
||||
t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package regexp_char_filter
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
)
|
||||
|
||||
type RegexpCharFilter struct {
|
||||
r *regexp.Regexp
|
||||
replacement []byte
|
||||
}
|
||||
|
||||
func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter {
|
||||
return &RegexpCharFilter{
|
||||
r: r,
|
||||
replacement: replacement,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RegexpCharFilter) Filter(input []byte) []byte {
|
||||
return s.r.ReplaceAllFunc(input, func(in []byte) []byte { return bytes.Repeat(s.replacement, len(in)) })
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package analysis
|
||||
|
||||
type TokenLocation struct {
|
||||
Start int
|
||||
End int
|
||||
Position int
|
||||
}
|
||||
|
||||
type TokenFreq struct {
|
||||
Term []byte
|
||||
Locations []*TokenLocation
|
||||
}
|
||||
|
||||
func TokenFrequency(tokens TokenStream) []*TokenFreq {
|
||||
index := make(map[string]*TokenFreq)
|
||||
|
||||
for _, token := range tokens {
|
||||
curr, ok := index[string(token.Term)]
|
||||
if ok {
|
||||
curr.Locations = append(curr.Locations, &TokenLocation{
|
||||
Start: token.Start,
|
||||
End: token.End,
|
||||
Position: token.Position,
|
||||
})
|
||||
} else {
|
||||
index[string(token.Term)] = &TokenFreq{
|
||||
Term: token.Term,
|
||||
Locations: []*TokenLocation{
|
||||
&TokenLocation{
|
||||
Start: token.Start,
|
||||
End: token.End,
|
||||
Position: token.Position,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
rv := make([]*TokenFreq, len(index))
|
||||
i := 0
|
||||
for _, tf := range index {
|
||||
rv[i] = tf
|
||||
i += 1
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package length_filter
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
type LengthFilter struct {
|
||||
min int
|
||||
max int
|
||||
}
|
||||
|
||||
func NewLengthFilter(min, max int) (*LengthFilter, error) {
|
||||
return &LengthFilter{
|
||||
min: min,
|
||||
max: max,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
|
||||
for _, token := range input {
|
||||
wordLen := utf8.RuneCount(token.Term)
|
||||
if f.min > 0 && f.min > wordLen {
|
||||
continue
|
||||
}
|
||||
if f.max > 0 && f.max < wordLen {
|
||||
continue
|
||||
}
|
||||
rv = append(rv, token)
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package length_filter
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestLengthFilter(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("1"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("two"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("three"),
|
||||
},
|
||||
}
|
||||
|
||||
lengthFilter, err := NewLengthFilter(3, 4)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
ouputTokenStream := lengthFilter.Filter(inputTokenStream)
|
||||
if len(ouputTokenStream) != 1 {
|
||||
t.Fatalf("expected 1 output token")
|
||||
}
|
||||
if string(ouputTokenStream[0].Term) != "two" {
|
||||
t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLengthFilterNoMax(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("1"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("two"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("three"),
|
||||
},
|
||||
}
|
||||
|
||||
lengthFilter, err := NewLengthFilter(3, -1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
ouputTokenStream := lengthFilter.Filter(inputTokenStream)
|
||||
if len(ouputTokenStream) != 2 {
|
||||
t.Fatalf("expected 2 output token")
|
||||
}
|
||||
if string(ouputTokenStream[0].Term) != "two" {
|
||||
t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term)
|
||||
}
|
||||
if string(ouputTokenStream[1].Term) != "three" {
|
||||
t.Errorf("expected term `three`, got `%s`", ouputTokenStream[0].Term)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLengthFilterNoMin(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("1"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("two"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("three"),
|
||||
},
|
||||
}
|
||||
|
||||
lengthFilter, err := NewLengthFilter(-1, 4)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
ouputTokenStream := lengthFilter.Filter(inputTokenStream)
|
||||
if len(ouputTokenStream) != 2 {
|
||||
t.Fatalf("expected 2 output token")
|
||||
}
|
||||
if string(ouputTokenStream[0].Term) != "1" {
|
||||
t.Errorf("expected term `1`, got `%s`", ouputTokenStream[0].Term)
|
||||
}
|
||||
if string(ouputTokenStream[1].Term) != "two" {
|
||||
t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package lower_case_filter
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
type LowerCaseFilter struct {
|
||||
}
|
||||
|
||||
func NewLowerCaseFilter() (*LowerCaseFilter, error) {
|
||||
return &LowerCaseFilter{}, nil
|
||||
}
|
||||
|
||||
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
|
||||
for _, token := range input {
|
||||
word := string(token.Term)
|
||||
wordLowerCase := strings.ToLower(word)
|
||||
token.Term = []byte(wordLowerCase)
|
||||
rv = append(rv, token)
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package lower_case_filter
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestLowerCaseFilter(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ONE"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("two"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ThReE"),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("one"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("two"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("three"),
|
||||
},
|
||||
}
|
||||
|
||||
filter, err := NewLowerCaseFilter()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package stemmer_filter
|
||||
|
||||
import (
|
||||
"bitbucket.org/tebeka/snowball"
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
type StemmerFilter struct {
|
||||
lang string
|
||||
stemmer *snowball.Stemmer
|
||||
}
|
||||
|
||||
func NewStemmerFilter(lang string) (*StemmerFilter, error) {
|
||||
stemmer, err := snowball.New(lang)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &StemmerFilter{
|
||||
lang: lang,
|
||||
stemmer: stemmer,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *StemmerFilter) List() []string {
|
||||
return snowball.LangList()
|
||||
}
|
||||
|
||||
func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
|
||||
for _, token := range input {
|
||||
stemmed := s.stemmer.Stem(string(token.Term))
|
||||
token.Term = []byte(stemmed)
|
||||
rv = append(rv, token)
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package stemmer_filter
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestStemmerFilter(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walking"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talked"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("business"),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("busi"),
|
||||
},
|
||||
}
|
||||
|
||||
filter, err := NewStemmerFilter("english")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package stop_words_filter
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
var DEFAULT_STOP_WORDS []string = []string{
|
||||
"a", "an", "and", "are", "as", "at", "be", "but", "by",
|
||||
"for", "if", "in", "into", "is", "it",
|
||||
"no", "not", "of", "on", "or", "such",
|
||||
"that", "the", "their", "then", "there", "these",
|
||||
"they", "this", "to", "was", "will", "with",
|
||||
}
|
||||
|
||||
type StopWordsFilter struct {
|
||||
stopWords map[string]bool
|
||||
}
|
||||
|
||||
func NewStopWordsFilter() (*StopWordsFilter, error) {
|
||||
return &StopWordsFilter{
|
||||
stopWords: buildStopWordMap(DEFAULT_STOP_WORDS),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
|
||||
for _, token := range input {
|
||||
word := string(token.Term)
|
||||
_, isStopWord := f.stopWords[word]
|
||||
if !isStopWord {
|
||||
rv = append(rv, token)
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func buildStopWordMap(words []string) map[string]bool {
|
||||
rv := make(map[string]bool, len(words))
|
||||
for _, word := range words {
|
||||
rv[word] = true
|
||||
}
|
||||
return rv
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package stop_words_filter
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestStopWordsFilter(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("walk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("in"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("park"),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("park"),
|
||||
},
|
||||
}
|
||||
|
||||
filter, err := NewStopWordsFilter()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package regexp_tokenizer
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
type RegexpTokenizer struct {
|
||||
r *regexp.Regexp
|
||||
}
|
||||
|
||||
func NewRegexpTokenizer(r *regexp.Regexp) *RegexpTokenizer {
|
||||
return &RegexpTokenizer{
|
||||
r: r,
|
||||
}
|
||||
}
|
||||
|
||||
func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
matches := rt.r.FindAllIndex(input, -1)
|
||||
rv := make(analysis.TokenStream, len(matches))
|
||||
for i, match := range matches {
|
||||
token := analysis.Token{
|
||||
Term: input[match[0]:match[1]],
|
||||
Start: match[0],
|
||||
End: match[1],
|
||||
Position: i + 1,
|
||||
}
|
||||
rv[i] = &token
|
||||
}
|
||||
return rv
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package simple_word_boundary
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
|
||||
)
|
||||
|
||||
const wordPattern = `\w+`
|
||||
|
||||
var wordRegex = regexp.MustCompile(wordPattern)
|
||||
|
||||
type SimpleWordBoundaryTokenizer struct {
|
||||
*regexp_tokenizer.RegexpTokenizer
|
||||
}
|
||||
|
||||
func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer {
|
||||
return &SimpleWordBoundaryTokenizer{
|
||||
regexp_tokenizer.NewRegexpTokenizer(wordRegex),
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package simple_word_boundary
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestBoundary(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
[]byte("Hello World."),
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
5,
|
||||
[]byte("Hello"),
|
||||
1,
|
||||
},
|
||||
{
|
||||
6,
|
||||
11,
|
||||
[]byte("World"),
|
||||
2,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
tokenizer := NewSimpleWordBoundaryTokenizer()
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package single_token
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
type SingleTokenTokenizer struct {
|
||||
}
|
||||
|
||||
func NewSingleTokenTokenizer() *SingleTokenTokenizer {
|
||||
return &SingleTokenTokenizer{}
|
||||
}
|
||||
|
||||
func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
return analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: input,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: len(input),
|
||||
},
|
||||
}
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package single_token
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestSingleTokenTokenizer(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
[]byte("Hello World"),
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
11,
|
||||
[]byte("Hello World"),
|
||||
1,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
[]byte("こんにちは世界"),
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
21,
|
||||
[]byte("こんにちは世界"),
|
||||
1,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
72,
|
||||
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
||||
1,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
tokenizer := NewSingleTokenTokenizer()
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,114 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package unicode_word_boundary
|
||||
|
||||
// #cgo pkg-config: icu-uc
|
||||
// #include <stdio.h>
|
||||
// #include <stdlib.h>
|
||||
// #include "unicode/utypes.h"
|
||||
// #include "unicode/uchar.h"
|
||||
// #include "unicode/ubrk.h"
|
||||
// #include "unicode/ustring.h"
|
||||
import "C"
|
||||
|
||||
import "log"
|
||||
import "unsafe"
|
||||
import "github.com/couchbaselabs/bleve/analysis"
|
||||
|
||||
// UnicodeWordBoundaryTokenizer splits text on Unicode word boundaries using
// ICU's break-iterator API via cgo.
type UnicodeWordBoundaryTokenizer struct {
	// locale is a C string handed to ubrk_open; nil selects ICU's default
	// locale.  Allocated by C.CString in the custom-locale constructor.
	locale *C.char
}

// NewUnicodeWordBoundaryTokenizer returns a tokenizer that uses ICU's
// default locale.
func NewUnicodeWordBoundaryTokenizer() *UnicodeWordBoundaryTokenizer {
	return &UnicodeWordBoundaryTokenizer{}
}

// NewUnicodeWordBoundaryCustomLocaleTokenizer returns a tokenizer for the
// given ICU locale name (e.g. "en_US", "th_TH").
// NOTE(review): the C string allocated here is freed inside Tokenize via a
// defer, which appears to make the tokenizer single-use — confirm ownership.
func NewUnicodeWordBoundaryCustomLocaleTokenizer(locale string) *UnicodeWordBoundaryTokenizer {
	return &UnicodeWordBoundaryTokenizer{
		locale: C.CString(locale),
	}
}
|
||||
|
||||
func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
// var bi *C.UBreakIterator
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
defer C.free(unsafe.Pointer(t.locale))
|
||||
|
||||
if len(input) < 1 {
|
||||
return rv
|
||||
}
|
||||
|
||||
// works
|
||||
var myUnsafePointer = unsafe.Pointer(&(input[0]))
|
||||
var myCCharPointer *C.char = (*C.char)(myUnsafePointer)
|
||||
|
||||
var inlen C.int32_t = C.int32_t(len(input))
|
||||
var buflen C.int32_t = C.int32_t(2*len(input) + 1) // worse case each byte becomes 2
|
||||
var stringToExamine []C.UChar = make([]C.UChar, buflen)
|
||||
//log.Printf("new buff is: %v", stringToExamine)
|
||||
var myUnsafePointerToExamine = unsafe.Pointer(&(stringToExamine[0]))
|
||||
var myUCharPointer *C.UChar = (*C.UChar)(myUnsafePointerToExamine)
|
||||
C.u_uastrncpy(myUCharPointer, myCCharPointer, inlen)
|
||||
|
||||
//log.Printf("after copy new buff is: %v", stringToExamine)
|
||||
|
||||
var err C.UErrorCode = C.U_ZERO_ERROR
|
||||
bi := C.ubrk_open(C.UBRK_WORD, t.locale, myUCharPointer, -1, &err)
|
||||
|
||||
if err > C.U_ZERO_ERROR {
|
||||
log.Printf("error opening boundary iterator")
|
||||
return rv
|
||||
}
|
||||
|
||||
defer C.ubrk_close(bi)
|
||||
|
||||
position := 0
|
||||
var prev C.int32_t
|
||||
p := C.ubrk_first(bi)
|
||||
for p != C.UBRK_DONE {
|
||||
|
||||
q := C.ubrk_getRuleStatus(bi)
|
||||
|
||||
// convert boundaries back to utf8 positions
|
||||
var nilCString *C.char
|
||||
var indexA C.int32_t
|
||||
|
||||
C.u_strToUTF8(nilCString, 0, &indexA, myUCharPointer, prev, &err)
|
||||
if err > C.U_ZERO_ERROR && err != C.U_BUFFER_OVERFLOW_ERROR {
|
||||
log.Printf("error converting boundary %d", err)
|
||||
return rv
|
||||
} else {
|
||||
err = C.U_ZERO_ERROR
|
||||
}
|
||||
|
||||
var indexB C.int32_t
|
||||
C.u_strToUTF8(nilCString, 0, &indexB, myUCharPointer, p, &err)
|
||||
if err > C.U_ZERO_ERROR && err != C.U_BUFFER_OVERFLOW_ERROR {
|
||||
log.Printf("error converting boundary %d", err)
|
||||
return rv
|
||||
} else {
|
||||
err = C.U_ZERO_ERROR
|
||||
}
|
||||
|
||||
if q != 0 {
|
||||
position += 1
|
||||
token := analysis.Token{
|
||||
Start: int(indexA),
|
||||
End: int(indexB),
|
||||
Term: input[indexA:indexB],
|
||||
Position: position,
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
}
|
||||
prev = p
|
||||
p = C.ubrk_next(bi)
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
|
@ -0,0 +1,125 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package unicode_word_boundary
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
// TestBoundary verifies ICU word-boundary tokenization for English, Japanese
// and Thai input.  All Start/End values are UTF-8 byte offsets into the
// input, not rune indexes (e.g. "こんにちは" spans bytes 0-15).
func TestBoundary(t *testing.T) {

	tests := []struct {
		input  []byte
		locale string
		output analysis.TokenStream
	}{
		{
			[]byte("Hello World"),
			"en_US",
			analysis.TokenStream{
				{
					0,
					5,
					[]byte("Hello"),
					1,
				},
				{
					6,
					11,
					[]byte("World"),
					2,
				},
			},
		},
		{
			// Japanese is segmented even under en_US per this expectation
			[]byte("こんにちは世界"),
			"en_US",
			analysis.TokenStream{
				{
					0,
					15,
					[]byte("こんにちは"),
					1,
				},
				{
					15,
					21,
					[]byte("世界"),
					2,
				},
			},
		},
		{
			// Thai has no inter-word spaces; requires the th_TH locale
			[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
			"th_TH",
			analysis.TokenStream{
				{
					0,
					9,
					[]byte("แยก"),
					1,
				},
				{
					9,
					15,
					[]byte("คำ"),
					2,
				},
				{
					15,
					27,
					[]byte("ภาษา"),
					3,
				},
				{
					27,
					36,
					[]byte("ไทย"),
					4,
				},
				{
					36,
					42,
					[]byte("ก็"),
					5,
				},
				{
					42,
					57,
					[]byte("ทำได้"),
					6,
				},
				{
					57,
					63,
					[]byte("นะ"),
					7,
				},
				{
					63,
					72,
					[]byte("จ้ะ"),
					8,
				},
			},
		},
	}

	for _, test := range tests {
		tokenizer := NewUnicodeWordBoundaryCustomLocaleTokenizer(test.locale)
		actual := tokenizer.Tokenize(test.input)

		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
		}
	}
}
|
|
@ -0,0 +1,59 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package analysis
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// CharFilter transforms raw input bytes before tokenization.
type CharFilter interface {
	Filter([]byte) []byte
}

// Token is a single term produced by a Tokenizer.  Start and End are byte
// offsets into the original input; Position is the 1-based token ordinal.
type Token struct {
	Start    int
	End      int
	Term     []byte
	Position int
}

// String renders the token for debugging output.
func (t *Token) String() string {
	return fmt.Sprintf("Start: %d End: %d Position: %d Token: %s", t.Start, t.End, t.Position, string(t.Term))
}

// TokenStream is an ordered sequence of tokens.
type TokenStream []*Token

// Tokenizer splits input bytes into a TokenStream.
type Tokenizer interface {
	Tokenize([]byte) TokenStream
}

// TokenFilter transforms a TokenStream (lowercasing, stop words, etc.).
type TokenFilter interface {
	Filter(TokenStream) TokenStream
}

// Analyzer is the complete text-analysis pipeline: character filters first,
// then the tokenizer, then token filters, each applied in declaration order.
type Analyzer struct {
	CharFilters  []CharFilter
	Tokenizer    Tokenizer
	TokenFilters []TokenFilter
}

// Analyze runs input through the pipeline and returns the resulting tokens.
func (a *Analyzer) Analyze(input []byte) TokenStream {
	// ranging over a nil slice is a no-op, so no explicit nil checks needed
	for _, cf := range a.CharFilters {
		input = cf.Filter(input)
	}
	stream := a.Tokenizer.Tokenize(input)
	for _, tf := range a.TokenFilters {
		stream = tf.Filter(stream)
	}
	return stream
}
|
|
@ -0,0 +1,34 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package document
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
)
|
||||
|
||||
type Document struct {
|
||||
ID string `json:"id"`
|
||||
Fields []*Field `json:"fields"`
|
||||
}
|
||||
|
||||
func NewDocument(id string) *Document {
|
||||
return &Document{
|
||||
ID: id,
|
||||
Fields: make([]*Field, 0),
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Document) AddField(f *Field) {
|
||||
d.Fields = append(d.Fields, f)
|
||||
}
|
||||
|
||||
func (d *Document) String() string {
|
||||
bytes, _ := json.MarshalIndent(d, "", " ")
|
||||
return string(bytes)
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package document
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
type Field struct {
|
||||
Name string
|
||||
IndexingOptions int
|
||||
Analyzer *analysis.Analyzer
|
||||
Value []byte
|
||||
}
|
||||
|
||||
func NewField(name string, value []byte, indexingOptions int, analyzer *analysis.Analyzer) *Field {
|
||||
return &Field{
|
||||
Name: name,
|
||||
IndexingOptions: indexingOptions,
|
||||
Analyzer: analyzer,
|
||||
Value: value,
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package document
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
"github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer"
|
||||
)
|
||||
|
||||
// standardAnalyzer is the package-wide default analyzer applied to text
// fields created without an explicit analyzer.
var standardAnalyzer *analysis.Analyzer

func init() {
	var err error
	standardAnalyzer, err = standard_analyzer.NewStandardAnalyzer()
	if err != nil {
		// text fields cannot work without their default analyzer; abort.
		// NOTE(review): log.Fatal in init makes the whole program depend on
		// this succeeding at import time
		log.Fatal(err)
	}
}

// DEFAULT_TEXT_INDEXING_OPTIONS indexes the field only: not stored, no
// term vectors.
const DEFAULT_TEXT_INDEXING_OPTIONS = INDEX_FIELD

// NewTextField creates an indexed text field with the default options.
func NewTextField(name string, value []byte) *Field {
	return NewTextFieldWithIndexingOptions(name, value, DEFAULT_TEXT_INDEXING_OPTIONS)
}

// NewTextFieldWithIndexingOptions creates a text field analyzed by the
// standard analyzer with caller-chosen indexing options.
func NewTextFieldWithIndexingOptions(name string, value []byte, indexingOptions int) *Field {
	return &Field{
		Name:            name,
		IndexingOptions: indexingOptions,
		Analyzer:        standardAnalyzer,
		Value:           value,
	}
}
|
|
@ -0,0 +1,27 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package document
|
||||
|
||||
// Field indexing options; combine them with bitwise OR.
const (
	INDEX_FIELD = 1 << iota // the field's terms are searchable
	STORE_FIELD             // the field's raw value is stored
	INCLUDE_TERM_VECTORS    // per-occurrence position/offset vectors are kept
)

// IsIndexedField reports whether arg has the INDEX_FIELD bit set.
func IsIndexedField(arg int) bool {
	return arg&INDEX_FIELD == INDEX_FIELD
}

// IsStoredField reports whether arg has the STORE_FIELD bit set.
func IsStoredField(arg int) bool {
	return arg&STORE_FIELD == STORE_FIELD
}

// IncludeTermVectors reports whether arg has the INCLUDE_TERM_VECTORS bit set.
func IncludeTermVectors(arg int) bool {
	return arg&INCLUDE_TERM_VECTORS == INCLUDE_TERM_VECTORS
}
|
|
@ -0,0 +1,68 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package document
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestIndexingOptions checks the three bit-flag predicates against every
// interesting combination of INDEX_FIELD, STORE_FIELD and
// INCLUDE_TERM_VECTORS.
func TestIndexingOptions(t *testing.T) {
	tests := []struct {
		indexingOptions    int
		isIndexed          bool
		isStored           bool
		includeTermVectors bool
	}{
		{
			indexingOptions:    INDEX_FIELD | STORE_FIELD | INCLUDE_TERM_VECTORS,
			isIndexed:          true,
			isStored:           true,
			includeTermVectors: true,
		},
		{
			indexingOptions:    INDEX_FIELD | INCLUDE_TERM_VECTORS,
			isIndexed:          true,
			isStored:           false,
			includeTermVectors: true,
		},
		{
			indexingOptions:    STORE_FIELD | INCLUDE_TERM_VECTORS,
			isIndexed:          false,
			isStored:           true,
			includeTermVectors: true,
		},
		{
			indexingOptions:    INDEX_FIELD,
			isIndexed:          true,
			isStored:           false,
			includeTermVectors: false,
		},
		{
			indexingOptions:    STORE_FIELD,
			isIndexed:          false,
			isStored:           true,
			includeTermVectors: false,
		},
	}

	for _, test := range tests {
		actuallyIndexed := IsIndexedField(test.indexingOptions)
		if actuallyIndexed != test.isIndexed {
			t.Errorf("expected indexed to be %v, got %v for %d", test.isIndexed, actuallyIndexed, test.indexingOptions)
		}
		actuallyStored := IsStoredField(test.indexingOptions)
		if actuallyStored != test.isStored {
			t.Errorf("expected stored to be %v, got %v for %d", test.isStored, actuallyStored, test.indexingOptions)
		}
		actuallyIncludeTermVectors := IncludeTermVectors(test.indexingOptions)
		if actuallyIncludeTermVectors != test.includeTermVectors {
			t.Errorf("expected includeTermVectors to be %v, got %v for %d", test.includeTermVectors, actuallyIncludeTermVectors, test.indexingOptions)
		}
	}
}
|
|
@ -0,0 +1,63 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
|
||||
"github.com/couchbaselabs/bleve/index/upside_down"
|
||||
"github.com/couchbaselabs/bleve/shredder"
|
||||
)
|
||||
|
||||
// jsonDir is the directory of JSON files to index; indexDir is where the
// upside_down index lives.
var jsonDir = flag.String("jsonDir", "json", "json directory")
var indexDir = flag.String("indexDir", "index", "index directory")

// main indexes every file under -jsonDir into the index at -indexDir,
// using each file's name as the document ID.
func main() {

	flag.Parse()

	// create a automatic JSON document shredder
	jsonShredder := shredder.NewAutoJsonShredder()

	// create a new index
	index := upside_down.NewUpsideDownCouch(*indexDir)
	err := index.Open()
	if err != nil {
		log.Fatal(err)
	}
	// NOTE(review): log.Fatal calls os.Exit, so this defer is skipped on
	// every fatal path below — confirm the index tolerates an unclean exit
	defer index.Close()

	// open the directory
	dirEntries, err := ioutil.ReadDir(*jsonDir)
	if err != nil {
		log.Fatal(err)
	}

	// walk the directory entries
	for _, dirEntry := range dirEntries {
		// read the bytes
		jsonBytes, err := ioutil.ReadFile(*jsonDir + "/" + dirEntry.Name())
		if err != nil {
			log.Fatal(err)
		}
		// shred them into a document
		doc, err := jsonShredder.Shred(dirEntry.Name(), jsonBytes)
		if err != nil {
			log.Fatal(err)
		}
		//log.Printf("%+v", doc)
		// update the index
		err = index.Update(doc)
		if err != nil {
			log.Fatal(err)
		}
	}
}
|
|
@ -0,0 +1,70 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/couchbaselabs/bleve/index/upside_down"
|
||||
"github.com/couchbaselabs/bleve/search"
|
||||
)
|
||||
|
||||
var field = flag.String("field", "description", "field to query")
|
||||
var indexDir = flag.String("indexDir", "index", "index directory")
|
||||
var limit = flag.Int("limit", 10, "limit to first N results")
|
||||
|
||||
func main() {
|
||||
|
||||
flag.Parse()
|
||||
|
||||
if flag.NArg() < 1 {
|
||||
log.Fatal("Specify search term")
|
||||
}
|
||||
|
||||
// open index
|
||||
index := upside_down.NewUpsideDownCouch(*indexDir)
|
||||
err := index.Open()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer index.Close()
|
||||
|
||||
tq := search.TermQuery{
|
||||
Term: flag.Arg(0),
|
||||
Field: *field,
|
||||
BoostVal: 1.0,
|
||||
Explain: true,
|
||||
}
|
||||
collector := search.NewTopScorerCollector(*limit)
|
||||
searcher, err := tq.Searcher(index)
|
||||
if err != nil {
|
||||
log.Fatalf("searcher error: %v", err)
|
||||
return
|
||||
}
|
||||
err = collector.Collect(searcher)
|
||||
if err != nil {
|
||||
log.Fatalf("search error: %v", err)
|
||||
return
|
||||
}
|
||||
results := collector.Results()
|
||||
if len(results) == 0 {
|
||||
fmt.Printf("No matches\n")
|
||||
} else {
|
||||
last := uint64(*limit)
|
||||
if searcher.Count() < last {
|
||||
last = searcher.Count()
|
||||
}
|
||||
fmt.Printf("%d matches, showing %d through %d\n", searcher.Count(), 1, last)
|
||||
for i, result := range results {
|
||||
fmt.Printf("%2d. %s (%f)\n", i+1, result.ID, result.Score)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package index
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/document"
|
||||
)
|
||||
|
||||
// Index is the contract every index implementation satisfies: lifecycle
// management, document mutation, per-term lookup, and debugging.
type Index interface {
	Open() error
	Close()

	Update(doc *document.Document) error
	Delete(id string) error

	// TermFieldReader iterates the documents containing term in field.
	TermFieldReader(term []byte, field string) (TermFieldReader, error)

	DocCount() uint64

	Dump()
}

// TermFieldVector records one occurrence of a term within a field: its
// token position and start/end byte offsets.
type TermFieldVector struct {
	Field string
	Pos   uint64
	Start uint64
	End   uint64
}

// TermFieldDoc is one posting: the document containing the term, its
// frequency in the field, a length-normalization factor, and optional
// occurrence vectors.
type TermFieldDoc struct {
	ID      string
	Freq    uint64
	Norm    float64
	Vectors []*TermFieldVector
}

// TermFieldReader iterates the postings of a single (term, field) pair.
// Next returns (nil, nil) when exhausted; Advance skips forward to the
// first document with id >= ID.
type TermFieldReader interface {
	Next() (*TermFieldDoc, error)
	Advance(ID string) (*TermFieldDoc, error)
	Count() uint64
	Close()
}
|
|
@ -0,0 +1,227 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package mock
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
"github.com/couchbaselabs/bleve/document"
|
||||
"github.com/couchbaselabs/bleve/index"
|
||||
)
|
||||
|
||||
// mockFreq holds the statistics for one (term, field, document) posting.
type mockFreq struct {
	freq    uint64
	norm    float64
	vectors []*index.TermFieldVector
}

// key doc id
type mockDocFreq map[string]*mockFreq

//key field
type mockFieldDocFreq map[string]mockDocFreq

// 2 dim array
// inner level are always pairs (field name, term)
// mockBackIndexEntry records every posting a document contributed, so the
// document can be cleanly removed later.
type mockBackIndexEntry [][]string

// MockIndex is an in-memory index implementation intended for tests.
type MockIndex struct {

	//this level of the map, the key is the term
	// full shape: term -> field -> doc id -> *mockFreq
	termIndex map[string]mockFieldDocFreq

	// key is docid
	backIndex map[string]mockBackIndexEntry

	docCount uint64
	// analyzer is keyed by field name; NOTE(review): not populated by any
	// method visible here — confirm whether it is still needed
	analyzer map[string]*analysis.Analyzer
}
|
||||
|
||||
func NewMockIndexWithDocs(docs []*document.Document) *MockIndex {
|
||||
rv := NewMockIndex()
|
||||
for _, doc := range docs {
|
||||
rv.Update(doc)
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func NewMockIndex() *MockIndex {
|
||||
mi := MockIndex{
|
||||
termIndex: make(map[string]mockFieldDocFreq),
|
||||
backIndex: make(map[string]mockBackIndexEntry),
|
||||
analyzer: make(map[string]*analysis.Analyzer),
|
||||
}
|
||||
|
||||
return &mi
|
||||
}
|
||||
|
||||
func (index *MockIndex) Open() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (index *MockIndex) Close() {}
|
||||
|
||||
// for this implementation we dont care about performance
// update is simply delete then add
//
// Update analyzes each field of doc, records per-term frequencies (and term
// vectors when the field's options request them) in termIndex, and saves a
// back-index entry so the doc's postings can be removed on a later
// update/delete.
func (index *MockIndex) Update(doc *document.Document) error {
	index.Delete(doc.ID)

	backIndexEntry := make(mockBackIndexEntry, 0)
	for _, field := range doc.Fields {

		analyzer := field.Analyzer
		tokens := analyzer.Analyze(field.Value)
		fieldLength := len(tokens) // number of tokens in this doc field
		fieldNorm := 1.0 / math.Sqrt(float64(fieldLength))
		tokenFreqs := analysis.TokenFrequency(tokens)
		for _, tf := range tokenFreqs {
			mf := mockFreq{
				freq: uint64(len(tf.Locations)),
				norm: fieldNorm,
			}
			if document.IncludeTermVectors(field.IndexingOptions) {
				mf.vectors = index.mockVectorsFromTokenFreq(field.Name, tf)
			}
			termString := string(tf.Term)
			// walk/create the two map levels: term -> field -> doc
			fieldMap, ok := index.termIndex[termString]
			if !ok {
				fieldMap = make(map[string]mockDocFreq)
				index.termIndex[termString] = fieldMap
			}
			docMap, ok := fieldMap[field.Name]
			if !ok {
				docMap = make(map[string]*mockFreq)
				fieldMap[field.Name] = docMap
			}
			docMap[doc.ID] = &mf
			backIndexInnerEntry := []string{field.Name, termString}
			backIndexEntry = append(backIndexEntry, backIndexInnerEntry)
		}
	}
	index.backIndex[doc.ID] = backIndexEntry
	// the Delete above decremented the count for an existing doc, so the
	// unconditional increment keeps the net count correct
	index.docCount += 1
	return nil
}
|
||||
|
||||
// Delete removes every posting recorded for id (found via its back-index
// entry), pruning now-empty field and term map levels so lookups of dead
// terms miss cleanly, then drops the back-index entry itself.  Deleting an
// unknown id is a no-op.
func (index *MockIndex) Delete(id string) error {
	backIndexEntry, existed := index.backIndex[id]
	if existed {
		for _, backIndexPair := range backIndexEntry {
			// each back-index pair is (field name, term)
			if len(backIndexPair) == 2 {
				field := backIndexPair[0]
				term := backIndexPair[1]
				delete(index.termIndex[term][field], id)
				if len(index.termIndex[term][field]) == 0 {
					delete(index.termIndex[term], field)
					if len(index.termIndex[term]) == 0 {
						delete(index.termIndex, term)
					}
				}
			}
		}
		delete(index.backIndex, id)
		index.docCount -= 1
	}

	return nil
}
|
||||
|
||||
func (index *MockIndex) TermFieldReader(term []byte, field string) (index.TermFieldReader, error) {
|
||||
|
||||
fdf, ok := index.termIndex[string(term)]
|
||||
if !ok {
|
||||
fdf = make(mockFieldDocFreq)
|
||||
}
|
||||
docFreqs, ok := fdf[field]
|
||||
if !ok {
|
||||
docFreqs = make(mockDocFreq)
|
||||
}
|
||||
mtfr := mockTermFieldReader{
|
||||
index: docFreqs,
|
||||
sortedDocIds: make(sort.StringSlice, len(docFreqs)),
|
||||
curr: -1,
|
||||
}
|
||||
i := 0
|
||||
for k, _ := range docFreqs {
|
||||
mtfr.sortedDocIds[i] = k
|
||||
i += 1
|
||||
}
|
||||
sort.Sort(mtfr.sortedDocIds)
|
||||
|
||||
return &mtfr, nil
|
||||
}
|
||||
|
||||
func (index *MockIndex) DocCount() uint64 {
|
||||
return index.docCount
|
||||
}
|
||||
|
||||
// mockTermFieldReader iterates the postings of one (term, field) pair in
// ascending doc-id order.
type mockTermFieldReader struct {
	index        mockDocFreq      // doc id -> posting stats
	sortedDocIds sort.StringSlice // iteration order over index's keys
	curr         int              // index of the last doc returned; -1 before the first Next
}
|
||||
|
||||
// Next returns the posting for the next doc id in sorted order, advancing
// the cursor, or (nil, nil) once the postings are exhausted.
func (reader *mockTermFieldReader) Next() (*index.TermFieldDoc, error) {
	next := reader.curr + 1
	if next < len(reader.sortedDocIds) {
		nextTermKey := reader.sortedDocIds[next]
		nextTerm := reader.index[nextTermKey]
		reader.curr = next
		return &index.TermFieldDoc{ID: nextTermKey, Freq: nextTerm.freq, Norm: nextTerm.norm, Vectors: nextTerm.vectors}, nil
	}
	// (nil, nil) is the end-of-stream sentinel, not an error
	return nil, nil
}
|
||||
|
||||
func (reader *mockTermFieldReader) Advance(ID string) (*index.TermFieldDoc, error) {
|
||||
if reader.curr >= len(reader.sortedDocIds) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
i := reader.curr
|
||||
for currTermID := reader.sortedDocIds[i]; currTermID < ID && i < len(reader.sortedDocIds); i += 1 {
|
||||
reader.curr = i
|
||||
currTermID = reader.sortedDocIds[reader.curr]
|
||||
}
|
||||
|
||||
if reader.curr < len(reader.sortedDocIds) {
|
||||
nextTermKey := reader.sortedDocIds[reader.curr]
|
||||
nextTerm := reader.index[nextTermKey]
|
||||
return &index.TermFieldDoc{ID: nextTermKey, Freq: nextTerm.freq, Norm: nextTerm.norm, Vectors: nextTerm.vectors}, nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Count returns the number of documents containing this term/field pair.
func (reader *mockTermFieldReader) Count() uint64 {
	return uint64(len(reader.sortedDocIds))
}

// Close is a no-op; the mock reader holds no external resources.
func (reader *mockTermFieldReader) Close() {}
|
||||
|
||||
// mockVectorsFromTokenFreq converts a token's occurrence list into
// index.TermFieldVector records tagged with the given field name.
func (mi *MockIndex) mockVectorsFromTokenFreq(field string, tf *analysis.TokenFreq) []*index.TermFieldVector {
	rv := make([]*index.TermFieldVector, len(tf.Locations))

	for i, l := range tf.Locations {
		mv := index.TermFieldVector{
			Field: field,
			Pos:   uint64(l.Position),
			Start: uint64(l.Start),
			End:   uint64(l.End),
		}
		rv[i] = &mv
	}

	return rv
}

// Dump satisfies the index.Index interface; not implemented for the mock.
func (mi *MockIndex) Dump() {
	fmt.Println("dump not implemented")
}
|
|
@ -0,0 +1,124 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package mock
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
_ "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer"
|
||||
"github.com/couchbaselabs/bleve/document"
|
||||
"github.com/couchbaselabs/bleve/index"
|
||||
)
|
||||
|
||||
// TestCRUD exercises the full MockIndex lifecycle — create, read via
// TermFieldReader, update, delete — checking DocCount and posting contents
// at each step.
func TestCRUD(t *testing.T) {
	i := NewMockIndex()

	// create doc, assert doc count goes up
	doc1 := document.NewDocument("1")
	doc1.AddField(document.NewTextField("name", []byte("marty")))
	i.Update(doc1)
	count := i.DocCount()
	if count != 1 {
		t.Errorf("expected document count to be 1, was: %d", count)
	}

	// add another doc, assert doc count goes up again
	doc2 := document.NewDocument("2")
	doc2.AddField(document.NewTextField("name", []byte("bob")))
	i.Update(doc2)
	count = i.DocCount()
	if count != 2 {
		t.Errorf("expected document count to be 2, was: %d", count)
	}

	// search for doc with term that should exist
	// Norm is 1 because the field holds a single token (1/sqrt(1))
	expectedMatch := &index.TermFieldDoc{
		ID:   "1",
		Freq: 1,
		Norm: 1,
	}
	tfr, err := i.TermFieldReader([]byte("marty"), "name")
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	match, err := tfr.Next()
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	if !reflect.DeepEqual(expectedMatch, match) {
		t.Errorf("got %v, expected %v", match, expectedMatch)
	}
	nomatch, err := tfr.Next()
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	if nomatch != nil {
		t.Errorf("expected nil after last match")
	}

	// update doc, assert doc count doesn't go up
	doc1 = document.NewDocument("1")
	doc1.AddField(document.NewTextField("name", []byte("salad")))
	doc1.AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("eat more rice"), document.INDEX_FIELD|document.INCLUDE_TERM_VECTORS))
	i.Update(doc1)
	count = i.DocCount()
	if count != 2 {
		t.Errorf("expected document count to be 2, was: %d", count)
	}

	// perform the original search again, should NOT find anything this time
	tfr, err = i.TermFieldReader([]byte("marty"), "name")
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	nomatch, err = tfr.Next()
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	if nomatch != nil {
		t.Errorf("expected no matches, found one")
		t.Logf("%v", i)
	}

	// delete a doc, ensure the count is 1
	err = i.Delete("2")
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	count = i.DocCount()
	if count != 1 {
		t.Errorf("expected document count to be 1, was: %d", count)
	}

	// "rice" is one of 3 tokens in the desc field, so Norm = 1/sqrt(3);
	// the vector records token position 3, byte offsets 9-13
	expectedMatch = &index.TermFieldDoc{
		ID:   "1",
		Freq: 1,
		Norm: 0.5773502691896258,
		Vectors: []*index.TermFieldVector{
			&index.TermFieldVector{
				Field: "desc",
				Pos:   3,
				Start: 9,
				End:   13,
			},
		},
	}
	tfr, err = i.TermFieldReader([]byte("rice"), "desc")
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	match, err = tfr.Next()
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	if !reflect.DeepEqual(expectedMatch, match) {
		t.Errorf("got %#v, expected %#v", match, expectedMatch)
	}
}
|
|
@ -0,0 +1,101 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package upside_down
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/jmhodges/levigo"
|
||||
|
||||
"github.com/couchbaselabs/bleve/index"
|
||||
)
|
||||
|
||||
// UpsideDownCouchTermFieldReader iterates the term-frequency rows for a
// single (term, field) pair, yielding one entry per document containing
// the term.
type UpsideDownCouchTermFieldReader struct {
	index    *UpsideDownCouch // owning index; used to resolve term vectors
	iterator *levigo.Iterator // positioned within this term's key range
	count    uint64           // aggregate frequency taken from the term's stats row
	term     []byte           // the term being read
	field    uint16           // numeric index of the field being read
}
|
||||
|
||||
func newUpsideDownCouchTermFieldReader(index *UpsideDownCouch, term []byte, field uint16) (*UpsideDownCouchTermFieldReader, error) {
|
||||
ro := defaultReadOptions()
|
||||
it := index.db.NewIterator(ro)
|
||||
|
||||
tfr := NewTermFrequencyRow(term, field, "", 0, 0)
|
||||
it.Seek(tfr.Key())
|
||||
|
||||
var count uint64 = 0
|
||||
if it.Valid() {
|
||||
if bytes.Equal(it.Key(), tfr.Key()) {
|
||||
tfr = ParseFromKeyValue(it.Key(), it.Value()).(*TermFrequencyRow)
|
||||
count = tfr.freq
|
||||
}
|
||||
|
||||
} else {
|
||||
return nil, it.GetError()
|
||||
}
|
||||
|
||||
return &UpsideDownCouchTermFieldReader{
|
||||
index: index,
|
||||
iterator: it,
|
||||
count: count,
|
||||
term: term,
|
||||
field: field,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Count returns the total frequency recorded for this term/field across all
// documents; it is read once from the stats row at construction time.
func (r *UpsideDownCouchTermFieldReader) Count() uint64 {
	return r.count
}
|
||||
|
||||
func (r *UpsideDownCouchTermFieldReader) Next() (*index.TermFieldDoc, error) {
|
||||
r.iterator.Next()
|
||||
if r.iterator.Valid() {
|
||||
tfr := NewTermFrequencyRow(r.term, r.field, "", 0, 0)
|
||||
if !bytes.HasPrefix(r.iterator.Key(), tfr.Key()) {
|
||||
// end of the line
|
||||
return nil, nil
|
||||
}
|
||||
tfr = ParseFromKeyValue(r.iterator.Key(), r.iterator.Value()).(*TermFrequencyRow)
|
||||
return &index.TermFieldDoc{
|
||||
ID: string(tfr.doc),
|
||||
Freq: tfr.freq,
|
||||
Norm: float64(tfr.norm),
|
||||
Vectors: r.index.termFieldVectorsFromTermVectors(tfr.vectors),
|
||||
}, nil
|
||||
} else {
|
||||
return nil, r.iterator.GetError()
|
||||
}
|
||||
}
|
||||
|
||||
func (r *UpsideDownCouchTermFieldReader) Advance(docId string) (*index.TermFieldDoc, error) {
|
||||
tfr := NewTermFrequencyRow(r.term, r.field, docId, 0, 0)
|
||||
r.iterator.Seek(tfr.Key())
|
||||
if r.iterator.Valid() {
|
||||
tfr := NewTermFrequencyRow(r.term, r.field, "", 0, 0)
|
||||
if !bytes.HasPrefix(r.iterator.Key(), tfr.Key()) {
|
||||
// end of the line
|
||||
return nil, nil
|
||||
}
|
||||
tfr = ParseFromKeyValue(r.iterator.Key(), r.iterator.Value()).(*TermFrequencyRow)
|
||||
return &index.TermFieldDoc{
|
||||
ID: string(tfr.doc),
|
||||
Freq: tfr.freq,
|
||||
Norm: float64(tfr.norm),
|
||||
Vectors: r.index.termFieldVectorsFromTermVectors(tfr.vectors),
|
||||
}, nil
|
||||
} else {
|
||||
return nil, r.iterator.GetError()
|
||||
}
|
||||
}
|
||||
|
||||
// Close releases the underlying iterator; the reader must not be used
// afterwards.
func (r *UpsideDownCouchTermFieldReader) Close() {
	r.iterator.Close()
}
|
|
@ -0,0 +1,111 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package upside_down
|
||||
|
||||
import (
|
||||
"os"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
_ "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer"
|
||||
"github.com/couchbaselabs/bleve/document"
|
||||
"github.com/couchbaselabs/bleve/index"
|
||||
)
|
||||
|
||||
func TestIndexReader(t *testing.T) {
|
||||
defer os.RemoveAll("test")
|
||||
|
||||
idx := NewUpsideDownCouch("test")
|
||||
|
||||
err := idx.Open()
|
||||
if err != nil {
|
||||
t.Errorf("error opening index: %v", err)
|
||||
}
|
||||
defer idx.Close()
|
||||
|
||||
var expectedCount uint64 = 0
|
||||
doc := document.NewDocument("1")
|
||||
doc.AddField(document.NewTextField("name", []byte("test")))
|
||||
err = idx.Update(doc)
|
||||
if err != nil {
|
||||
t.Errorf("Error updating index: %v", err)
|
||||
}
|
||||
expectedCount += 1
|
||||
|
||||
doc = document.NewDocument("2")
|
||||
doc.AddField(document.NewTextField("name", []byte("test test test")))
|
||||
doc.AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("eat more rice"), document.INDEX_FIELD|document.INCLUDE_TERM_VECTORS))
|
||||
err = idx.Update(doc)
|
||||
if err != nil {
|
||||
t.Errorf("Error updating index: %v", err)
|
||||
}
|
||||
expectedCount += 1
|
||||
|
||||
// first look for a term that doesnt exist
|
||||
reader, err := idx.TermFieldReader([]byte("nope"), "name")
|
||||
if err != nil {
|
||||
t.Errorf("Error accessing term field reader: %v", err)
|
||||
}
|
||||
count := reader.Count()
|
||||
if count != 0 {
|
||||
t.Errorf("Expected doc count to be: %d got: %d", 0, count)
|
||||
}
|
||||
reader.Close()
|
||||
|
||||
reader, err = idx.TermFieldReader([]byte("test"), "name")
|
||||
if err != nil {
|
||||
t.Errorf("Error accessing term field reader: %v", err)
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
expectedCount = 2
|
||||
count = reader.Count()
|
||||
if count != expectedCount {
|
||||
t.Errorf("Exptected doc count to be: %d got: %d", expectedCount, count)
|
||||
}
|
||||
|
||||
var match *index.TermFieldDoc
|
||||
var actualCount uint64
|
||||
match, err = reader.Next()
|
||||
for err == nil && match != nil {
|
||||
match, err = reader.Next()
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error reading next")
|
||||
}
|
||||
actualCount += 1
|
||||
}
|
||||
if actualCount != count {
|
||||
t.Errorf("count was 2, but only saw %d", actualCount)
|
||||
}
|
||||
|
||||
expectedMatch := &index.TermFieldDoc{
|
||||
ID: "2",
|
||||
Freq: 1,
|
||||
Norm: 0.5773502588272095,
|
||||
Vectors: []*index.TermFieldVector{
|
||||
&index.TermFieldVector{
|
||||
Field: "desc",
|
||||
Pos: 3,
|
||||
Start: 9,
|
||||
End: 13,
|
||||
},
|
||||
},
|
||||
}
|
||||
tfr, err := idx.TermFieldReader([]byte("rice"), "desc")
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %v", err)
|
||||
}
|
||||
match, err = tfr.Next()
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %v", err)
|
||||
}
|
||||
if !reflect.DeepEqual(expectedMatch, match) {
|
||||
t.Errorf("got %#v, expected %#v", match, expectedMatch)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,412 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package upside_down
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
)
|
||||
|
||||
// BYTE_SEPARATOR delimits variable-length components (terms, names, doc ids)
// inside row keys and values; presumably chosen because 0xff never occurs in
// valid UTF-8 text — TODO confirm.
const BYTE_SEPARATOR byte = 0xff

// UpsideDownCouchRowStream delivers rows over a channel.
type UpsideDownCouchRowStream chan UpsideDownCouchRow

// UpsideDownCouchRow is the common interface for every row type persisted
// to the underlying key/value store.
type UpsideDownCouchRow interface {
	Key() []byte
	Value() []byte
}
|
||||
|
||||
func ParseFromKeyValue(key, value []byte) UpsideDownCouchRow {
|
||||
switch key[0] {
|
||||
case 'v':
|
||||
return NewVersionRowKV(key, value)
|
||||
case 'f':
|
||||
return NewFieldRowKV(key, value)
|
||||
case 't':
|
||||
return NewTermFrequencyRowKV(key, value)
|
||||
case 'b':
|
||||
return NewBackIndexRowKV(key, value)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// VERSION
|
||||
|
||||
// VersionRow records the on-disk format version of the index.
type VersionRow struct {
	version uint8
}

// Key returns the fixed single-byte key under which the version is stored.
func (v *VersionRow) Key() []byte {
	return []byte{'v'}
}

// Value encodes the version number as a single little-endian byte.
func (v *VersionRow) Value() []byte {
	var buf bytes.Buffer
	if err := binary.Write(&buf, binary.LittleEndian, v.version); err != nil {
		panic(fmt.Sprintf("binary.Write failed: %v", err))
	}
	return buf.Bytes()
}

// String implements fmt.Stringer for debugging.
func (v *VersionRow) String() string {
	return fmt.Sprintf("Version: %d", v.version)
}

// NewVersionRow builds a VersionRow for the given format version.
func NewVersionRow(version uint8) *VersionRow {
	return &VersionRow{version: version}
}

// NewVersionRowKV decodes a VersionRow from its stored key/value form,
// panicking if the value cannot be read.
func NewVersionRowKV(key, value []byte) *VersionRow {
	var rv VersionRow
	if err := binary.Read(bytes.NewBuffer(value), binary.LittleEndian, &rv.version); err != nil {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}
	return &rv
}
|
||||
|
||||
// FIELD definition
|
||||
|
||||
type FieldRow struct {
|
||||
index uint16
|
||||
name string
|
||||
}
|
||||
|
||||
func (f *FieldRow) Key() []byte {
|
||||
buf := new(bytes.Buffer)
|
||||
err := buf.WriteByte('f')
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
|
||||
}
|
||||
err = binary.Write(buf, binary.LittleEndian, f.index)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||
}
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
func (f *FieldRow) Value() []byte {
|
||||
buf := new(bytes.Buffer)
|
||||
_, err := buf.WriteString(f.name)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.WriteString failed: %v", err))
|
||||
}
|
||||
err = buf.WriteByte(BYTE_SEPARATOR)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
|
||||
}
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
func (f *FieldRow) String() string {
|
||||
return fmt.Sprintf("Field: %d Name: %s", f.index, f.name)
|
||||
}
|
||||
|
||||
func NewFieldRow(index uint16, name string) *FieldRow {
|
||||
return &FieldRow{
|
||||
index: index,
|
||||
name: name,
|
||||
}
|
||||
}
|
||||
|
||||
func NewFieldRowKV(key, value []byte) *FieldRow {
|
||||
rv := FieldRow{}
|
||||
|
||||
buf := bytes.NewBuffer(key)
|
||||
buf.ReadByte() // type
|
||||
err := binary.Read(buf, binary.LittleEndian, &rv.index)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Read failed: %v", err))
|
||||
}
|
||||
|
||||
buf = bytes.NewBuffer(value)
|
||||
rv.name, err = buf.ReadString(BYTE_SEPARATOR)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.ReadString failed: %v", err))
|
||||
}
|
||||
rv.name = rv.name[:len(rv.name)-1] // trim off separator byte
|
||||
|
||||
return &rv
|
||||
}
|
||||
|
||||
// TERM FIELD FREQUENCY
|
||||
|
||||
// TermVector records one positional occurrence of a term: which field it
// appeared in, its token position, and its byte offsets.
type TermVector struct {
	field uint16
	pos   uint64
	start uint64
	end   uint64
}

// String implements fmt.Stringer for debugging.
func (tv *TermVector) String() string {
	return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d", tv.field, tv.pos, tv.start, tv.end)
}
|
||||
|
||||
// TermFrequencyRow records how often a term appears in one field of one
// document, plus its normalization factor and optional term vectors. A row
// with an empty doc id serves as the aggregate stats row for the term/field.
type TermFrequencyRow struct {
	term    []byte        // the indexed term
	field   uint16        // numeric field index
	doc     []byte        // document id (empty for the aggregate row)
	freq    uint64        // occurrences of the term in this doc/field
	norm    float32       // field-length normalization factor
	vectors []*TermVector // optional positional information
}
|
||||
|
||||
func (tfr *TermFrequencyRow) Key() []byte {
|
||||
buf := new(bytes.Buffer)
|
||||
err := buf.WriteByte('t')
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
|
||||
}
|
||||
_, err = buf.Write(tfr.term)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.Write failed: %v", err))
|
||||
}
|
||||
err = buf.WriteByte(BYTE_SEPARATOR)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
|
||||
}
|
||||
err = binary.Write(buf, binary.LittleEndian, tfr.field)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||
}
|
||||
_, err = buf.Write(tfr.doc)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.Write failed: %v", err))
|
||||
}
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
func (tfr *TermFrequencyRow) Value() []byte {
|
||||
buf := new(bytes.Buffer)
|
||||
err := binary.Write(buf, binary.LittleEndian, tfr.freq)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||
}
|
||||
err = binary.Write(buf, binary.LittleEndian, tfr.norm)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||
}
|
||||
for _, vector := range tfr.vectors {
|
||||
err = binary.Write(buf, binary.LittleEndian, vector.field)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||
}
|
||||
err = binary.Write(buf, binary.LittleEndian, vector.pos)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||
}
|
||||
err = binary.Write(buf, binary.LittleEndian, vector.start)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||
}
|
||||
err = binary.Write(buf, binary.LittleEndian, vector.end)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||
}
|
||||
}
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
// String implements fmt.Stringer for debugging.
func (tfr *TermFrequencyRow) String() string {
	return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
}
|
||||
|
||||
func NewTermFrequencyRow(term []byte, field uint16, doc string, freq uint64, norm float32) *TermFrequencyRow {
|
||||
return &TermFrequencyRow{
|
||||
term: term,
|
||||
field: field,
|
||||
doc: []byte(doc),
|
||||
freq: freq,
|
||||
norm: norm,
|
||||
}
|
||||
}
|
||||
|
||||
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, doc string, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
|
||||
return &TermFrequencyRow{
|
||||
term: term,
|
||||
field: field,
|
||||
doc: []byte(doc),
|
||||
freq: freq,
|
||||
norm: norm,
|
||||
vectors: vectors,
|
||||
}
|
||||
}
|
||||
|
||||
// NewTermFrequencyRowKV decodes a TermFrequencyRow from its stored
// key/value form: key = 't' + term + BYTE_SEPARATOR + field + doc id,
// value = freq + norm followed by zero or more term vectors. It panics on
// malformed input.
func NewTermFrequencyRowKV(key, value []byte) *TermFrequencyRow {
	rv := TermFrequencyRow{
		doc: []byte(""),
	}
	buf := bytes.NewBuffer(key)
	buf.ReadByte() // type

	var err error
	rv.term, err = buf.ReadBytes(BYTE_SEPARATOR)
	if err != nil {
		panic(fmt.Sprintf("Buffer.ReadString failed: %v", err))
	}
	rv.term = rv.term[:len(rv.term)-1] // trim off separator byte

	err = binary.Read(buf, binary.LittleEndian, &rv.field)
	if err != nil {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}

	// the doc id is the unterminated remainder of the key, so ReadBytes is
	// expected to stop at EOF rather than at a separator
	doc, err := buf.ReadBytes(BYTE_SEPARATOR)
	if err != io.EOF {
		panic(fmt.Sprintf("expected binary.ReadString to end in EOF: %v", err))
	}
	if doc != nil {
		rv.doc = doc
	}

	buf = bytes.NewBuffer((value))
	err = binary.Read(buf, binary.LittleEndian, &rv.freq)
	if err != nil {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}
	err = binary.Read(buf, binary.LittleEndian, &rv.norm)
	if err != nil {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}

	// term vectors are optional: io.EOF on the first field read means there
	// are none; afterwards EOF is the loop's normal termination condition
	var field uint16
	err = binary.Read(buf, binary.LittleEndian, &field)
	if err != nil && err != io.EOF {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}
	for err != io.EOF {
		tv := TermVector{}
		tv.field = field
		// at this point we expect at least one term vector
		if rv.vectors == nil {
			rv.vectors = make([]*TermVector, 0)
		}

		err = binary.Read(buf, binary.LittleEndian, &tv.pos)
		if err != nil {
			panic(fmt.Sprintf("binary.Read failed: %v", err))
		}
		err = binary.Read(buf, binary.LittleEndian, &tv.start)
		if err != nil {
			panic(fmt.Sprintf("binary.Read failed: %v", err))
		}
		err = binary.Read(buf, binary.LittleEndian, &tv.end)
		if err != nil {
			panic(fmt.Sprintf("binary.Read failed: %v", err))
		}
		rv.vectors = append(rv.vectors, &tv)
		// try to read next record (may not exist)
		err = binary.Read(buf, binary.LittleEndian, &field)
	}

	return &rv

}
|
||||
|
||||
// BackIndexEntry names one term/field pair that a document was indexed
// under.
type BackIndexEntry struct {
	term  []byte
	field uint16
}

// String implements fmt.Stringer for debugging.
func (bie *BackIndexEntry) String() string {
	return fmt.Sprintf("Term: `%s` Field: %d", string(bie.term), bie.field)
}
|
||||
|
||||
// BackIndexRow maps a document id to every term/field pair it was indexed
// under, enabling cleanup of stale term rows on update and delete.
type BackIndexRow struct {
	doc     []byte            // document id
	entries []*BackIndexEntry // term/field pairs indexed for this document
}
|
||||
|
||||
func (br *BackIndexRow) Key() []byte {
|
||||
buf := new(bytes.Buffer)
|
||||
err := buf.WriteByte('b')
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
|
||||
}
|
||||
err = binary.Write(buf, binary.LittleEndian, br.doc)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||
}
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
func (br *BackIndexRow) Value() []byte {
|
||||
buf := new(bytes.Buffer)
|
||||
for _, e := range br.entries {
|
||||
_, err := buf.Write(e.term)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.Write failed: %v", err))
|
||||
}
|
||||
err = buf.WriteByte(BYTE_SEPARATOR)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
|
||||
}
|
||||
err = binary.Write(buf, binary.LittleEndian, e.field)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||
}
|
||||
}
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
// String implements fmt.Stringer for debugging.
func (br *BackIndexRow) String() string {
	return fmt.Sprintf("Backindex DocId: `%s` Entries: %v", string(br.doc), br.entries)
}
|
||||
|
||||
func NewBackIndexRow(doc string, entries []*BackIndexEntry) *BackIndexRow {
|
||||
return &BackIndexRow{
|
||||
doc: []byte(doc),
|
||||
entries: entries,
|
||||
}
|
||||
}
|
||||
|
||||
// NewBackIndexRowKV decodes a BackIndexRow from its stored key/value form:
// key = 'b' + doc id, value = repeated (term + BYTE_SEPARATOR + field).
// It panics on malformed input.
func NewBackIndexRowKV(key, value []byte) *BackIndexRow {
	rv := BackIndexRow{}

	buf := bytes.NewBuffer(key)
	buf.ReadByte() // type

	// the doc id is the unterminated remainder of the key, so ReadBytes is
	// expected to stop at EOF rather than at a separator
	var err error
	rv.doc, err = buf.ReadBytes(BYTE_SEPARATOR)
	if err != io.EOF {
		panic(fmt.Sprintf("expected binary.ReadString to end in EOF: %v", err))
	}

	buf = bytes.NewBuffer(value)
	rv.entries = make([]*BackIndexEntry, 0)

	// io.EOF on the first read means no entries; afterwards EOF is the
	// loop's normal termination condition
	var term []byte
	term, err = buf.ReadBytes(BYTE_SEPARATOR)
	if err != nil && err != io.EOF {
		panic(fmt.Sprintf("Buffer.ReadString failed: %v", err))
	}
	for err != io.EOF {
		ent := BackIndexEntry{}
		ent.term = term[:len(term)-1] // trim off separator byte

		err = binary.Read(buf, binary.LittleEndian, &ent.field)
		if err != nil {
			panic(fmt.Sprintf("binary.Read failed: %v", err))
		}
		rv.entries = append(rv.entries, &ent)

		term, err = buf.ReadBytes(BYTE_SEPARATOR)
		if err != nil && err != io.EOF {
			panic(fmt.Sprintf("Buffer.ReadString failed: %v", err))
		}
	}

	return &rv
}
|
|
@ -0,0 +1,89 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package upside_down
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestRows round-trips every row type: first struct -> key/value bytes
// against hand-computed fixtures, then bytes -> struct via
// ParseFromKeyValue. Fixture byte slices are little-endian.
func TestRows(t *testing.T) {
	tests := []struct {
		input  UpsideDownCouchRow
		outKey []byte
		outVal []byte
	}{
		{
			NewVersionRow(1),
			[]byte{'v'},
			[]byte{0x1},
		},
		{
			NewFieldRow(0, "name"),
			[]byte{'f', 0, 0},
			[]byte{'n', 'a', 'm', 'e', BYTE_SEPARATOR},
		},
		{
			NewFieldRow(1, "desc"),
			[]byte{'f', 1, 0},
			[]byte{'d', 'e', 's', 'c', BYTE_SEPARATOR},
		},
		{
			// 513 = 0x0201 little-endian -> bytes 1, 2
			NewFieldRow(513, "style"),
			[]byte{'f', 1, 2},
			[]byte{'s', 't', 'y', 'l', 'e', BYTE_SEPARATOR},
		},
		{
			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "", 3, 3.14),
			[]byte{'t', 'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0},
			[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64},
		},
		{
			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14),
			[]byte{'t', 'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
			[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64},
		},
		{
			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
			[]byte{'t', 'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
			[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 51, 0, 0, 0, 0, 0, 0, 0},
		},
		{
			NewBackIndexRow("budweiser", []*BackIndexEntry{&BackIndexEntry{[]byte{'b', 'e', 'e', 'r'}, 0}}),
			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
			[]byte{'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0},
		},
		{
			NewBackIndexRow("budweiser", []*BackIndexEntry{&BackIndexEntry{[]byte{'b', 'e', 'e', 'r'}, 0}, &BackIndexEntry{[]byte{'b', 'e', 'a', 't'}, 1}}),
			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
			[]byte{'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0, 'b', 'e', 'a', 't', BYTE_SEPARATOR, 1, 0},
		},
	}

	// test going from struct to k/v bytes
	for _, test := range tests {
		rk := test.input.Key()
		if !reflect.DeepEqual(rk, test.outKey) {
			t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
		}
		rv := test.input.Value()
		if !reflect.DeepEqual(rv, test.outVal) {
			t.Errorf("Expected value to be %v got: %v", test.outVal, rv)
		}
	}

	// now test going back from k/v bytes to struct
	for _, test := range tests {
		row := ParseFromKeyValue(test.outKey, test.outVal)
		if !reflect.DeepEqual(row, test.input) {
			t.Fatalf("Expected: %#v got: %#v", test.input, row)
		}
	}

}
|
|
@ -0,0 +1,466 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package upside_down
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"log"
|
||||
"math"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
"github.com/jmhodges/levigo"
|
||||
|
||||
"github.com/couchbaselabs/bleve/document"
|
||||
"github.com/couchbaselabs/bleve/index"
|
||||
)
|
||||
|
||||
// VERSION_KEY is the key of the single row recording the on-disk format
// version.
var VERSION_KEY []byte = []byte{'v'}

// VERSION is the current on-disk format version written by this code.
const VERSION uint8 = 1
|
||||
|
||||
// UpsideDownCouch is a LevelDB-backed index implementation.
type UpsideDownCouch struct {
	version        uint8                         // on-disk format version
	path           string                        // filesystem path of the LevelDB database
	opts           *levigo.Options               // options used to open the database
	db             *levigo.DB                    // open database handle (nil until Open)
	fieldIndexes   map[string]uint16             // field name -> numeric field index
	lastFieldIndex int                           // highest field index assigned so far
	analyzer       map[string]*analysis.Analyzer // per-field analyzers; not used in the code visible here — TODO confirm
	docCount       uint64                        // cached number of documents in the index
}
|
||||
|
||||
// NewUpsideDownCouch creates an index handle rooted at path. The database
// itself is not opened until Open is called; create-if-missing is enabled
// so Open will create a fresh database when none exists.
func NewUpsideDownCouch(path string) *UpsideDownCouch {
	opts := levigo.NewOptions()
	opts.SetCreateIfMissing(true)

	return &UpsideDownCouch{
		version:      VERSION,
		path:         path,
		opts:         opts,
		analyzer:     make(map[string]*analysis.Analyzer),
		fieldIndexes: make(map[string]uint16),
	}
}
|
||||
|
||||
func (udc *UpsideDownCouch) init() (err error) {
|
||||
// prepare a list of rows
|
||||
rows := make([]UpsideDownCouchRow, 0)
|
||||
|
||||
// version marker
|
||||
rows = append(rows, NewVersionRow(udc.version))
|
||||
|
||||
return udc.batchRows(nil, rows, nil)
|
||||
}
|
||||
|
||||
func (udc *UpsideDownCouch) loadSchema() (err error) {
|
||||
// schema := make([]*index.Field, 0)
|
||||
|
||||
ro := defaultReadOptions()
|
||||
it := udc.db.NewIterator(ro)
|
||||
defer it.Close()
|
||||
|
||||
keyPrefix := []byte{'f'}
|
||||
it.Seek(keyPrefix)
|
||||
for it = it; it.Valid(); it.Next() {
|
||||
// stop when
|
||||
if !bytes.HasPrefix(it.Key(), keyPrefix) {
|
||||
break
|
||||
}
|
||||
fieldRow := NewFieldRowKV(it.Key(), it.Value())
|
||||
udc.fieldIndexes[fieldRow.name] = fieldRow.index
|
||||
if int(fieldRow.index) > udc.lastFieldIndex {
|
||||
udc.lastFieldIndex = int(fieldRow.index)
|
||||
}
|
||||
}
|
||||
err = it.GetError()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// batchRows applies adds, updates, and deletes atomically through a single
// LevelDB write batch. For term-frequency rows it also maintains the
// per-term aggregate stats row (the row with an empty doc id): adds
// increment its freq, deletes decrement it and remove it at zero.
//
// NOTE(review): the levigo ro, wb, and wo objects wrap C allocations and
// are presumably leaked here because Close is never called on them — verify
// against the levigo API and free them before returning.
func (udc *UpsideDownCouch) batchRows(addRows []UpsideDownCouchRow, updateRows []UpsideDownCouchRow, deleteRows []UpsideDownCouchRow) (err error) {
	ro := defaultReadOptions()

	// prepare batch
	wb := levigo.NewWriteBatch()

	// add
	for _, row := range addRows {
		tfr, ok := row.(*TermFrequencyRow)
		if ok {
			// need to increment the aggregate counter for this term/field
			tr := NewTermFrequencyRow(tfr.term, tfr.field, "", 0, 0)
			val, err := udc.db.Get(ro, tr.Key())
			if err != nil {
				return err
			}
			if val != nil {
				tr = ParseFromKeyValue(tr.Key(), val).(*TermFrequencyRow)
				tr.freq += 1 // increment existing aggregate count
			} else {
				// first occurrence of this term/field: start the count at 1
				tr = NewTermFrequencyRow(tfr.term, tfr.field, "", 1, 0)
			}

			// now add this to the batch
			wb.Put(tr.Key(), tr.Value())
		}
		wb.Put(row.Key(), row.Value())
	}

	// update
	for _, row := range updateRows {
		wb.Put(row.Key(), row.Value())
	}

	// delete
	for _, row := range deleteRows {
		tfr, ok := row.(*TermFrequencyRow)
		if ok {
			// need to decrement the aggregate counter for this term/field
			tr := NewTermFrequencyRow(tfr.term, tfr.field, "", 0, 0)
			val, err := udc.db.Get(ro, tr.Key())
			if err != nil {
				return err
			}
			if val != nil {
				tr = ParseFromKeyValue(tr.Key(), val).(*TermFrequencyRow)
				tr.freq -= 1 // decrement (original comment said "incr")
			} else {
				// deleting a term whose aggregate row is missing is an
				// invariant violation (message typo: "exit" means "exist")
				log.Panic(fmt.Sprintf("unexpected missing row, deleting term, expected count row to exit: %v", tr.Key()))
			}

			if tr.freq == 0 {
				// last reference gone: drop the aggregate row entirely
				wb.Delete(tr.Key())
			} else {
				// now add this to the batch
				wb.Put(tr.Key(), tr.Value())
			}

		}
		wb.Delete(row.Key())
	}

	// write out the batch
	wo := defaultWriteOptions()
	err = udc.db.Write(wo, wb)
	return
}
|
||||
|
||||
// DocCount returns the cached number of documents in the index.
func (udc *UpsideDownCouch) DocCount() uint64 {
	return udc.docCount
}
|
||||
|
||||
// Open opens (creating if missing) the underlying LevelDB database. A
// missing version row means a brand-new database, which is initialized;
// otherwise the existing schema is loaded. Finally the document count is
// computed and cached.
func (udc *UpsideDownCouch) Open() (err error) {
	udc.db, err = levigo.Open(udc.path, udc.opts)
	if err != nil {
		return
	}

	ro := defaultReadOptions()
	var value []byte
	value, err = udc.db.Get(ro, VERSION_KEY)
	if err != nil {
		return
	}

	// init new index OR load schema
	if value == nil {
		// no version row: fresh database
		err = udc.init()
		if err != nil {
			return
		}
	} else {
		err = udc.loadSchema()
		if err != nil {
			return
		}
	}
	// set doc count
	udc.docCount = udc.countDocs()
	return
}
|
||||
|
||||
func (udc *UpsideDownCouch) countDocs() uint64 {
|
||||
ro := defaultReadOptions()
|
||||
ro.SetFillCache(false) // dont fill the cache with this
|
||||
it := udc.db.NewIterator(ro)
|
||||
defer it.Close()
|
||||
|
||||
// begining of back index
|
||||
it.Seek([]byte{'b'})
|
||||
|
||||
var rv uint64 = 0
|
||||
for it = it; it.Valid(); it.Next() {
|
||||
if !bytes.HasPrefix(it.Key(), []byte{'b'}) {
|
||||
break
|
||||
}
|
||||
rv += 1
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (udc *UpsideDownCouch) rowCount() uint64 {
|
||||
ro := defaultReadOptions()
|
||||
ro.SetFillCache(false) // dont fill the cache with this
|
||||
it := udc.db.NewIterator(ro)
|
||||
defer it.Close()
|
||||
|
||||
it.Seek([]byte{0})
|
||||
|
||||
var rv uint64 = 0
|
||||
for it = it; it.Valid(); it.Next() {
|
||||
rv += 1
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// Close closes the underlying database handle; the index must not be used
// afterwards.
func (udc *UpsideDownCouch) Close() {
	udc.db.Close()
}
|
||||
|
||||
func (udc *UpsideDownCouch) Update(doc *document.Document) error {
|
||||
// first we lookup the backindex row for the doc id if it exists
|
||||
// lookup the back index row
|
||||
backIndexRow, err := udc.backIndexRowForDoc(doc.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var isAdd = true
|
||||
// a map for each field, map key is term (string) bool true for existence
|
||||
// FIMXE hard-coded to max of 256 fields
|
||||
existingTermFieldMaps := make([]map[string]bool, 256)
|
||||
if backIndexRow != nil {
|
||||
isAdd = false
|
||||
for _, entry := range backIndexRow.entries {
|
||||
existingTermFieldMap := existingTermFieldMaps[entry.field]
|
||||
if existingTermFieldMap == nil {
|
||||
existingTermFieldMap = make(map[string]bool, 0)
|
||||
existingTermFieldMaps[entry.field] = existingTermFieldMap
|
||||
}
|
||||
existingTermFieldMap[string(entry.term)] = true
|
||||
}
|
||||
}
|
||||
|
||||
// prepare a list of rows
|
||||
updateRows := make([]UpsideDownCouchRow, 0)
|
||||
addRows := make([]UpsideDownCouchRow, 0)
|
||||
|
||||
// track our back index entries
|
||||
backIndexEntries := make([]*BackIndexEntry, 0)
|
||||
|
||||
for _, field := range doc.Fields {
|
||||
fieldIndex, fieldExists := udc.fieldIndexes[field.Name]
|
||||
if !fieldExists {
|
||||
// assign next field id
|
||||
fieldIndex = uint16(udc.lastFieldIndex + 1)
|
||||
udc.fieldIndexes[field.Name] = fieldIndex
|
||||
// ensure this batch adds a row for this field
|
||||
row := NewFieldRow(uint16(fieldIndex), field.Name)
|
||||
updateRows = append(updateRows, row)
|
||||
udc.lastFieldIndex = int(fieldIndex)
|
||||
}
|
||||
|
||||
existingTermFieldMap := existingTermFieldMaps[fieldIndex]
|
||||
|
||||
analyzer := field.Analyzer
|
||||
tokens := analyzer.Analyze(field.Value)
|
||||
fieldLength := len(tokens) // number of tokens in this doc field
|
||||
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
|
||||
tokenFreqs := analysis.TokenFrequency(tokens)
|
||||
for _, tf := range tokenFreqs {
|
||||
var termFreqRow *TermFrequencyRow
|
||||
if document.IncludeTermVectors(field.IndexingOptions) {
|
||||
tv := termVectorsFromTokenFreq(uint16(fieldIndex), tf)
|
||||
termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, uint16(fieldIndex), doc.ID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
|
||||
} else {
|
||||
termFreqRow = NewTermFrequencyRow(tf.Term, uint16(fieldIndex), doc.ID, uint64(frequencyFromTokenFreq(tf)), fieldNorm)
|
||||
}
|
||||
|
||||
// record the back index entry
|
||||
backIndexEntry := BackIndexEntry{tf.Term, uint16(fieldIndex)}
|
||||
backIndexEntries = append(backIndexEntries, &backIndexEntry)
|
||||
|
||||
// remove the entry from the map of existing term fields if it exists
|
||||
if existingTermFieldMap != nil {
|
||||
termString := string(tf.Term)
|
||||
_, ok := existingTermFieldMap[termString]
|
||||
if ok {
|
||||
// this is an update
|
||||
updateRows = append(updateRows, termFreqRow)
|
||||
// this term existed last time, delete it from that map
|
||||
delete(existingTermFieldMap, termString)
|
||||
} else {
|
||||
// this is an add
|
||||
addRows = append(addRows, termFreqRow)
|
||||
}
|
||||
} else {
|
||||
// this is an add
|
||||
addRows = append(addRows, termFreqRow)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// build the back index row
|
||||
backIndexRow = NewBackIndexRow(doc.ID, backIndexEntries)
|
||||
updateRows = append(updateRows, backIndexRow)
|
||||
|
||||
// any of the existing rows that weren't updated need to be deleted
|
||||
deleteRows := make([]UpsideDownCouchRow, 0)
|
||||
for fieldIndex, existingTermFieldMap := range existingTermFieldMaps {
|
||||
if existingTermFieldMap != nil {
|
||||
for termString, _ := range existingTermFieldMap {
|
||||
termFreqRow := NewTermFrequencyRow([]byte(termString), uint16(fieldIndex), doc.ID, 0, 0)
|
||||
deleteRows = append(deleteRows, termFreqRow)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
err = udc.batchRows(addRows, updateRows, deleteRows)
|
||||
if err == nil && isAdd {
|
||||
udc.docCount += 1
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (udc *UpsideDownCouch) Delete(id string) error {
|
||||
// lookup the back index row
|
||||
backIndexRow, err := udc.backIndexRowForDoc(id)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if backIndexRow == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// prepare a list of rows to delete
|
||||
rows := make([]UpsideDownCouchRow, 0)
|
||||
for _, backIndexEntry := range backIndexRow.entries {
|
||||
tfr := NewTermFrequencyRow(backIndexEntry.term, backIndexEntry.field, id, 0, 0)
|
||||
rows = append(rows, tfr)
|
||||
}
|
||||
|
||||
// also delete the back entry itself
|
||||
rows = append(rows, backIndexRow)
|
||||
|
||||
err = udc.batchRows(nil, nil, rows)
|
||||
if err == nil {
|
||||
udc.docCount -= 1
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (udc *UpsideDownCouch) backIndexRowForDoc(docId string) (*BackIndexRow, error) {
|
||||
ro := defaultReadOptions()
|
||||
|
||||
// use a temporary row structure to build key
|
||||
tempRow := &BackIndexRow{
|
||||
doc: []byte(docId),
|
||||
}
|
||||
key := tempRow.Key()
|
||||
value, err := udc.db.Get(ro, key)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if value == nil {
|
||||
return nil, nil
|
||||
}
|
||||
backIndexRow := ParseFromKeyValue(key, value).(*BackIndexRow)
|
||||
return backIndexRow, nil
|
||||
}
|
||||
|
||||
func (udc *UpsideDownCouch) Dump() {
|
||||
ro := defaultReadOptions()
|
||||
ro.SetFillCache(false)
|
||||
it := udc.db.NewIterator(ro)
|
||||
defer it.Close()
|
||||
it.SeekToFirst()
|
||||
for it = it; it.Valid(); it.Next() {
|
||||
//fmt.Printf("Key: `%v` Value: `%v`\n", string(it.Key()), string(it.Value()))
|
||||
row := ParseFromKeyValue(it.Key(), it.Value())
|
||||
if row != nil {
|
||||
fmt.Printf("%v\n", row)
|
||||
fmt.Printf("Key: % -100x\nValue: % -100x\n\n", it.Key(), it.Value())
|
||||
}
|
||||
}
|
||||
err := it.GetError()
|
||||
if err != nil {
|
||||
fmt.Printf("Error reading iterator: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (udc *UpsideDownCouch) TermFieldReader(term []byte, fieldName string) (index.TermFieldReader, error) {
|
||||
fieldIndex, fieldExists := udc.fieldIndexes[fieldName]
|
||||
if fieldExists {
|
||||
return newUpsideDownCouchTermFieldReader(udc, term, uint16(fieldIndex))
|
||||
}
|
||||
log.Printf("fields: %v", udc.fieldIndexes)
|
||||
return nil, fmt.Errorf("No field named `%s` in the schema", fieldName)
|
||||
}
|
||||
|
||||
func defaultWriteOptions() *levigo.WriteOptions {
|
||||
wo := levigo.NewWriteOptions()
|
||||
// request fsync on write for safety
|
||||
wo.SetSync(true)
|
||||
return wo
|
||||
}
|
||||
|
||||
func defaultReadOptions() *levigo.ReadOptions {
|
||||
ro := levigo.NewReadOptions()
|
||||
return ro
|
||||
}
|
||||
|
||||
func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
|
||||
return len(tf.Locations)
|
||||
}
|
||||
|
||||
func termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) []*TermVector {
|
||||
rv := make([]*TermVector, len(tf.Locations))
|
||||
|
||||
for i, l := range tf.Locations {
|
||||
tv := TermVector{
|
||||
field: field,
|
||||
pos: uint64(l.Position),
|
||||
start: uint64(l.Start),
|
||||
end: uint64(l.End),
|
||||
}
|
||||
rv[i] = &tv
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
|
||||
rv := make([]*index.TermFieldVector, len(in))
|
||||
|
||||
for i, tv := range in {
|
||||
fieldName := udc.fieldIndexToName(tv.field)
|
||||
tfv := index.TermFieldVector{
|
||||
Field: fieldName,
|
||||
Pos: tv.pos,
|
||||
Start: tv.start,
|
||||
End: tv.end,
|
||||
}
|
||||
rv[i] = &tfv
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (udc *UpsideDownCouch) fieldIndexToName(i uint16) string {
|
||||
for fieldName, fieldIndex := range udc.fieldIndexes {
|
||||
if i == fieldIndex {
|
||||
return fieldName
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
|
@ -0,0 +1,221 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package upside_down
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
_ "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer"
|
||||
"github.com/couchbaselabs/bleve/document"
|
||||
)
|
||||
|
||||
func TestIndexOpenReopen(t *testing.T) {
|
||||
defer os.RemoveAll("test")
|
||||
|
||||
idx := NewUpsideDownCouch("test")
|
||||
err := idx.Open()
|
||||
if err != nil {
|
||||
t.Errorf("error opening index: %v", err)
|
||||
}
|
||||
|
||||
var expectedCount uint64 = 0
|
||||
docCount := idx.DocCount()
|
||||
if docCount != expectedCount {
|
||||
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
|
||||
}
|
||||
|
||||
// opening database should have inserted version
|
||||
expectedLength := uint64(1)
|
||||
rowCount := idx.rowCount()
|
||||
if rowCount != expectedLength {
|
||||
t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
|
||||
}
|
||||
|
||||
// now close it
|
||||
idx.Close()
|
||||
|
||||
idx = NewUpsideDownCouch("test")
|
||||
err = idx.Open()
|
||||
if err != nil {
|
||||
t.Errorf("error opening index: %v", err)
|
||||
}
|
||||
|
||||
// now close it
|
||||
idx.Close()
|
||||
}
|
||||
|
||||
func TestIndexInsert(t *testing.T) {
|
||||
defer os.RemoveAll("test")
|
||||
|
||||
idx := NewUpsideDownCouch("test")
|
||||
|
||||
err := idx.Open()
|
||||
if err != nil {
|
||||
t.Errorf("error opening index: %v", err)
|
||||
}
|
||||
defer idx.Close()
|
||||
|
||||
var expectedCount uint64 = 0
|
||||
docCount := idx.DocCount()
|
||||
if docCount != expectedCount {
|
||||
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
|
||||
}
|
||||
|
||||
doc := document.NewDocument("1")
|
||||
doc.AddField(document.NewTextField("name", []byte("test")))
|
||||
err = idx.Update(doc)
|
||||
if err != nil {
|
||||
t.Errorf("Error updating index: %v", err)
|
||||
}
|
||||
expectedCount += 1
|
||||
|
||||
docCount = idx.DocCount()
|
||||
if docCount != expectedCount {
|
||||
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
|
||||
}
|
||||
|
||||
// should have 4 rows (1 for version, 1 for schema field, and 1 for single term, and 1 for the term count, and 1 for the back index entry)
|
||||
expectedLength := uint64(1 + 1 + 1 + 1 + 1)
|
||||
rowCount := idx.rowCount()
|
||||
if rowCount != expectedLength {
|
||||
t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIndexInsertThenDelete(t *testing.T) {
|
||||
defer os.RemoveAll("test")
|
||||
|
||||
idx := NewUpsideDownCouch("test")
|
||||
|
||||
err := idx.Open()
|
||||
if err != nil {
|
||||
t.Errorf("error opening index: %v", err)
|
||||
}
|
||||
defer idx.Close()
|
||||
|
||||
var expectedCount uint64 = 0
|
||||
docCount := idx.DocCount()
|
||||
if docCount != expectedCount {
|
||||
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
|
||||
}
|
||||
|
||||
doc := document.NewDocument("1")
|
||||
doc.AddField(document.NewTextField("name", []byte("test")))
|
||||
err = idx.Update(doc)
|
||||
if err != nil {
|
||||
t.Errorf("Error updating index: %v", err)
|
||||
}
|
||||
expectedCount += 1
|
||||
|
||||
docCount = idx.DocCount()
|
||||
if docCount != expectedCount {
|
||||
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
|
||||
}
|
||||
|
||||
err = idx.Delete("1")
|
||||
if err != nil {
|
||||
t.Errorf("Error deleting entry from index: %v", err)
|
||||
}
|
||||
expectedCount -= 1
|
||||
|
||||
docCount = idx.DocCount()
|
||||
if docCount != expectedCount {
|
||||
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
|
||||
}
|
||||
|
||||
// should have 2 row (1 for version, 1 for schema field)
|
||||
expectedLength := uint64(1 + 1)
|
||||
rowCount := idx.rowCount()
|
||||
if rowCount != expectedLength {
|
||||
t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIndexInsertThenUpdate(t *testing.T) {
|
||||
defer os.RemoveAll("test")
|
||||
|
||||
idx := NewUpsideDownCouch("test")
|
||||
|
||||
err := idx.Open()
|
||||
if err != nil {
|
||||
t.Errorf("error opening index: %v", err)
|
||||
}
|
||||
defer idx.Close()
|
||||
|
||||
doc := document.NewDocument("1")
|
||||
doc.AddField(document.NewTextField("name", []byte("test")))
|
||||
err = idx.Update(doc)
|
||||
if err != nil {
|
||||
t.Errorf("Error updating index: %v", err)
|
||||
}
|
||||
|
||||
// this update should overwrite one term, and introduce one new one
|
||||
doc = document.NewDocument("1")
|
||||
doc.AddField(document.NewTextField("name", []byte("test fail")))
|
||||
err = idx.Update(doc)
|
||||
if err != nil {
|
||||
t.Errorf("Error deleting entry from index: %v", err)
|
||||
}
|
||||
|
||||
// should have 2 row (1 for version, 1 for schema field, and 2 for the two term, and 2 for the term counts, and 1 for the back index entry)
|
||||
expectedLength := uint64(1 + 1 + 2 + 2 + 1)
|
||||
rowCount := idx.rowCount()
|
||||
if rowCount != expectedLength {
|
||||
t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
|
||||
}
|
||||
|
||||
// now do another update that should remove one of term
|
||||
doc = document.NewDocument("1")
|
||||
doc.AddField(document.NewTextField("name", []byte("fail")))
|
||||
err = idx.Update(doc)
|
||||
if err != nil {
|
||||
t.Errorf("Error deleting entry from index: %v", err)
|
||||
}
|
||||
|
||||
// should have 2 row (1 for version, 1 for schema field, and 1 for the remaining term, and 1 for the term count, and 1 for the back index entry)
|
||||
expectedLength = uint64(1 + 1 + 1 + 1 + 1)
|
||||
rowCount = idx.rowCount()
|
||||
if rowCount != expectedLength {
|
||||
t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIndexInsertMultiple(t *testing.T) {
|
||||
defer os.RemoveAll("test")
|
||||
|
||||
idx := NewUpsideDownCouch("test")
|
||||
|
||||
err := idx.Open()
|
||||
if err != nil {
|
||||
t.Errorf("error opening index: %v", err)
|
||||
}
|
||||
defer idx.Close()
|
||||
|
||||
doc := document.NewDocument("1")
|
||||
doc.AddField(document.NewTextField("name", []byte("test")))
|
||||
err = idx.Update(doc)
|
||||
if err != nil {
|
||||
t.Errorf("Error updating index: %v", err)
|
||||
}
|
||||
|
||||
doc = document.NewDocument("2")
|
||||
doc.AddField(document.NewTextField("name", []byte("test")))
|
||||
err = idx.Update(doc)
|
||||
if err != nil {
|
||||
t.Errorf("Error updating index: %v", err)
|
||||
}
|
||||
|
||||
// should have 4 rows (1 for version, 1 for schema field, and 2 for single term, and 1 for the term count, and 2 for the back index entries)
|
||||
expectedLength := uint64(1 + 1 + 2 + 1 + 2)
|
||||
rowCount := idx.rowCount()
|
||||
if rowCount != expectedLength {
|
||||
t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// Collector consumes every match a Searcher produces and accumulates
// the ones to be returned, along with summary statistics of the pass.
type Collector interface {
	// Collect drains the searcher, observing every match it yields.
	Collect(searcher Searcher) error
	// Results returns the retained matches.
	Results() DocumentMatchCollection
	// Total reports how many matches were observed in all.
	Total() uint64
	// MaxScore reports the highest score observed.
	MaxScore() float64
	// Took reports how long the collection pass took.
	Took() time.Duration
}
|
|
@ -0,0 +1,96 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
import (
|
||||
"container/list"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TopScoreCollector retains the k highest-scoring matches seen so far.
// The matches are held in a linked list ordered ascending by score
// (lowest at the front) — see collectSingle.
type TopScoreCollector struct {
	k        int           // maximum number of results to retain
	results  *list.List    // retained matches, ascending by score
	took     time.Duration // duration of the last Collect pass
	maxScore float64       // highest score observed so far
	total    uint64        // total number of matches observed
}
|
||||
|
||||
func NewTopScorerCollector(k int) *TopScoreCollector {
|
||||
return &TopScoreCollector{
|
||||
k: k,
|
||||
results: list.New(),
|
||||
}
|
||||
}
|
||||
|
||||
// Total reports how many matches were observed during Collect.
func (tksc *TopScoreCollector) Total() uint64 {
	return tksc.total
}
|
||||
|
||||
// MaxScore reports the highest score observed during Collect.
func (tksc *TopScoreCollector) MaxScore() float64 {
	return tksc.maxScore
}
|
||||
|
||||
// Took reports how long the last Collect pass took.
func (tksc *TopScoreCollector) Took() time.Duration {
	return tksc.took
}
|
||||
|
||||
func (tksc *TopScoreCollector) Collect(searcher Searcher) error {
|
||||
startTime := time.Now()
|
||||
next, err := searcher.Next()
|
||||
for err == nil && next != nil {
|
||||
tksc.collectSingle(next)
|
||||
next, err = searcher.Next()
|
||||
}
|
||||
// compute search duration
|
||||
tksc.took = time.Since(startTime)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// collectSingle folds one match into the collector: it bumps the
// running total and max score, inserts the match into the
// ascending-by-score list, and trims the list back to k entries by
// dropping the lowest-scoring (front) element.
func (tksc *TopScoreCollector) collectSingle(dm *DocumentMatch) {
	// increment total hits
	tksc.total += 1

	// update max score
	if dm.Score > tksc.maxScore {
		tksc.maxScore = dm.Score
	}

	// walk the ascending list; insert before the first entry that
	// outscores this match to keep the ordering invariant
	for e := tksc.results.Front(); e != nil; e = e.Next() {
		curr := e.Value.(*DocumentMatch)
		if dm.Score < curr.Score {

			tksc.results.InsertBefore(dm, e)
			// if we just made the list too long
			if tksc.results.Len() > tksc.k {
				// remove the head (the current lowest score)
				tksc.results.Remove(tksc.results.Front())
			}
			return
		}
	}
	// if we got to the end, we still have to add it (it is the highest
	// score so far, or the list was empty)
	tksc.results.PushBack(dm)
	if tksc.results.Len() > tksc.k {
		// remove the head (the current lowest score)
		tksc.results.Remove(tksc.results.Front())
	}
}
|
||||
|
||||
func (tksc *TopScoreCollector) Results() DocumentMatchCollection {
|
||||
rv := make(DocumentMatchCollection, tksc.results.Len())
|
||||
i := 0
|
||||
for e := tksc.results.Back(); e != nil; e = e.Prev() {
|
||||
rv[i] = e.Value.(*DocumentMatch)
|
||||
i++
|
||||
}
|
||||
return rv
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestTop10Scores(t *testing.T) {
|
||||
|
||||
// a stub search with more than 10 matches
|
||||
// the top-10 scores are > 10
|
||||
// everything else is less than 10
|
||||
searcher := &stubSearcher{
|
||||
matches: DocumentMatchCollection{
|
||||
&DocumentMatch{
|
||||
ID: "a",
|
||||
Score: 11,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "b",
|
||||
Score: 9,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "c",
|
||||
Score: 11,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "d",
|
||||
Score: 9,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "e",
|
||||
Score: 11,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "f",
|
||||
Score: 9,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "g",
|
||||
Score: 11,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "h",
|
||||
Score: 9,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "i",
|
||||
Score: 11,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "j",
|
||||
Score: 11,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "k",
|
||||
Score: 11,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "l",
|
||||
Score: 99,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "m",
|
||||
Score: 11,
|
||||
},
|
||||
&DocumentMatch{
|
||||
ID: "n",
|
||||
Score: 11,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
collector := NewTopScorerCollector(10)
|
||||
collector.Collect(searcher)
|
||||
results := collector.Results()
|
||||
|
||||
if len(results) != 10 {
|
||||
t.Fatalf("expected 10 results, got %d", len(results))
|
||||
}
|
||||
|
||||
if results[0].ID != "l" {
|
||||
t.Errorf("expected first result to have ID 'l', got %s", results[0].ID)
|
||||
}
|
||||
|
||||
if results[0].Score != 99.0 {
|
||||
t.Errorf("expected highest score to be 99.0, got %f", results[0].Score)
|
||||
}
|
||||
|
||||
minScore := 1000.0
|
||||
for _, result := range results {
|
||||
if result.Score < minScore {
|
||||
minScore = result.Score
|
||||
}
|
||||
}
|
||||
|
||||
if minScore < 10 {
|
||||
t.Errorf("expected minimum score to be higher than 10, got %f", minScore)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// Explanation records how a score was computed: the value produced at
// this step, a human-readable description, and the child steps (if
// any) the value was derived from.
type Explanation struct {
	Value    float64        `json:"value"`
	Message  string         `json:"message"`
	Children []*Explanation `json:"children,omitempty"`
}

// String renders the explanation tree as indented JSON.
func (expl *Explanation) String() string {
	js, err := json.MarshalIndent(expl, "", " ")
	if err != nil {
		// fixed typo in the message: was "explation"
		return fmt.Sprintf("error serializing explanation to json: %v", err)
	}
	return string(js)
}
|
|
@ -0,0 +1,19 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/index"
|
||||
)
|
||||
|
||||
// Query describes a search against an index. Implementations know how
// to construct a Searcher producing their matches.
type Query interface {
	// Boost returns the score multiplier for this query.
	Boost() float64
	// Searcher builds a Searcher running this query against index.
	Searcher(index index.Index) (Searcher, error)
	// Validate reports whether the query is well formed.
	Validate() error
}
|
|
@ -0,0 +1,32 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/index"
|
||||
)
|
||||
|
||||
// TermQuery matches documents containing an exact term in a field.
type TermQuery struct {
	Term     string  `json:"term"`              // the exact term to match
	Field    string  `json:"field,omitempty"`   // name of the field to search
	BoostVal float64 `json:"boost,omitempty"`   // score multiplier
	Explain  bool    `json:"explain,omitempty"` // record score explanations
}
|
||||
|
||||
// Boost returns the score multiplier configured for this query.
func (q *TermQuery) Boost() float64 {
	return q.BoostVal
}
|
||||
|
||||
// Searcher builds a term searcher running this query against index.
func (q *TermQuery) Searcher(index index.Index) (Searcher, error) {
	return NewTermSearcher(index, q)
}
|
||||
|
||||
// Validate reports whether the query is well formed; a term query is
// always considered valid.
func (q *TermQuery) Validate() error {
	return nil
}
|
|
@ -0,0 +1,172 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/couchbaselabs/bleve/index"
|
||||
)
|
||||
|
||||
// MAX_SCORE_CACHE bounds the term frequencies for which computed
// scores (and explanations) are memoized by the scorer.
const MAX_SCORE_CACHE = 64

// TermQueryScorer computes tf-idf scores for matches of a single term
// query, optionally building score explanation trees.
type TermQueryScorer struct {
	query                  *TermQuery
	docTerm                uint64  // number of documents containing the term
	docTotal               uint64  // total number of documents in the index
	idf                    float64 // inverse document frequency of the term
	explain                bool    // whether to build explanations
	idfExplanation         *Explanation
	scoreCache             map[int]float64      // memoized scores, keyed by term frequency
	scoreExplanationCache  map[int]*Explanation // memoized explanations, keyed by term frequency
	queryNorm              float64
	queryWeight            float64 // boost * idf * queryNorm; 1.0 until SetQueryNorm
	queryWeightExplanation *Explanation
}
|
||||
|
||||
func NewTermQueryScorer(query *TermQuery, docTotal, docTerm uint64, explain bool) *TermQueryScorer {
|
||||
rv := TermQueryScorer{
|
||||
query: query,
|
||||
docTerm: docTerm,
|
||||
docTotal: docTotal,
|
||||
idf: 1.0 + math.Log(float64(docTotal)/float64(docTerm+1.0)),
|
||||
explain: explain,
|
||||
scoreCache: make(map[int]float64, MAX_SCORE_CACHE),
|
||||
scoreExplanationCache: make(map[int]*Explanation, MAX_SCORE_CACHE),
|
||||
queryWeight: 1.0,
|
||||
}
|
||||
|
||||
if explain {
|
||||
rv.idfExplanation = &Explanation{
|
||||
Value: rv.idf,
|
||||
Message: fmt.Sprintf("idf(docFreq=%d, maxDocs=%d)", docTerm, docTotal),
|
||||
}
|
||||
}
|
||||
|
||||
return &rv
|
||||
}
|
||||
|
||||
func (s *TermQueryScorer) Weight() float64 {
|
||||
sum := s.query.Boost() * s.idf
|
||||
return sum * sum
|
||||
}
|
||||
|
||||
func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
|
||||
s.queryNorm = qnorm
|
||||
|
||||
// update the query weight
|
||||
s.queryWeight = s.query.Boost() * s.idf * s.queryNorm
|
||||
|
||||
if s.explain {
|
||||
childrenExplanations := make([]*Explanation, 3)
|
||||
childrenExplanations[0] = &Explanation{
|
||||
Value: s.query.Boost(),
|
||||
Message: "boost",
|
||||
}
|
||||
childrenExplanations[1] = s.idfExplanation
|
||||
childrenExplanations[2] = &Explanation{
|
||||
Value: s.queryNorm,
|
||||
Message: "queryNorm",
|
||||
}
|
||||
s.queryWeightExplanation = &Explanation{
|
||||
Value: s.queryWeight,
|
||||
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.query.Field, string(s.query.Term), s.query.Boost()),
|
||||
Children: childrenExplanations,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Score computes the tf-idf score of a single term match and wraps it
// in a DocumentMatch. Scores (and, in explain mode, explanation
// trees) for term frequencies below MAX_SCORE_CACHE are memoized.
// Stored term vectors on the match, if any, are converted into
// location information on the result.
func (s *TermQueryScorer) Score(termMatch *index.TermFieldDoc) *DocumentMatch {

	var scoreExplanation *Explanation
	// see if the score was cached
	score, ok := s.scoreCache[int(termMatch.Freq)]
	if !ok {
		// need to compute score; tf = sqrt(freq), from the cache when small
		var tf float64
		if termMatch.Freq < MAX_SQRT_CACHE {
			tf = SQRT_CACHE[int(termMatch.Freq)]
		} else {
			tf = math.Sqrt(float64(termMatch.Freq))
		}

		// raw field weight: tf * fieldNorm * idf
		score = tf * termMatch.Norm * s.idf

		if s.explain {
			childrenExplanations := make([]*Explanation, 3)
			childrenExplanations[0] = &Explanation{
				Value:   tf,
				Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.query.Field, string(s.query.Term), termMatch.Freq),
			}
			childrenExplanations[1] = &Explanation{
				Value:   termMatch.Norm,
				Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.query.Field, termMatch.ID),
			}
			childrenExplanations[2] = s.idfExplanation
			scoreExplanation = &Explanation{
				Value:    score,
				Message:  fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.query.Field, string(s.query.Term), termMatch.ID),
				Children: childrenExplanations,
			}
		}

		// if the query weight isn't 1, multiply
		if s.queryWeight != 1.0 {
			score = score * s.queryWeight
			if s.explain {
				// wrap the field weight explanation together with the
				// query weight explanation built in SetQueryNorm
				childExplanations := make([]*Explanation, 2)
				childExplanations[0] = s.queryWeightExplanation
				childExplanations[1] = scoreExplanation
				scoreExplanation = &Explanation{
					Value:    score,
					Message:  fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.query.Field, string(s.query.Term), s.query.Boost(), termMatch.ID),
					Children: childExplanations,
				}
			}
		}

		// memoize the result for small term frequencies
		if termMatch.Freq < MAX_SCORE_CACHE {
			s.scoreCache[int(termMatch.Freq)] = score
			if s.explain {
				s.scoreExplanationCache[int(termMatch.Freq)] = scoreExplanation
			}
		}
	}

	// cache hit: reuse the memoized explanation
	if ok && s.explain {
		scoreExplanation = s.scoreExplanationCache[int(termMatch.Freq)]
	}

	rv := DocumentMatch{
		ID:    termMatch.ID,
		Score: score,
	}
	if s.explain {
		rv.Expl = scoreExplanation
	}

	// convert stored term vectors into per-field location information
	if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 {
		locations := make(Locations, len(termMatch.Vectors))
		for i, v := range termMatch.Vectors {
			loc := Location{
				Pos:   float64(v.Pos),
				Start: float64(v.Start),
				End:   float64(v.End),
			}
			locations[i] = &loc
		}
		tlm := make(TermLocationMap)
		tlm[s.query.Term] = locations
		rv.Locations = make(FieldTermLocationMap)
		rv.Locations[s.query.Field] = tlm
	}

	return &rv
}
|
|
@ -0,0 +1,39 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
// Location identifies one occurrence of a term within a field: its
// token position and its start/end offsets.
type Location struct {
	Pos   float64 `json:"pos"`
	Start float64 `json:"start"`
	End   float64 `json:"end"`
}

// Locations is a list of term occurrences.
type Locations []*Location

// TermLocationMap maps a term to the places it occurred.
type TermLocationMap map[string]Locations

// FieldTermLocationMap maps a field name to its term locations.
type FieldTermLocationMap map[string]TermLocationMap

// DocumentMatch is a single search hit.
type DocumentMatch struct {
	ID        string               `json:"id"`
	Score     float64              `json:"score"`
	Expl      *Explanation         `json:"explanation,omitempty"` // set only in explain mode
	Locations FieldTermLocationMap `json:"locations,omitempty"`   // set only when term vectors were stored
}

// DocumentMatchCollection is an ordered set of search hits.
type DocumentMatchCollection []*DocumentMatch

// Searcher produces the matches for a query.
type Searcher interface {
	// Next returns the next match, or nil when exhausted.
	Next() (*DocumentMatch, error)
	// Advance skips ahead to the match with the given ID.
	Advance(ID string) (*DocumentMatch, error)
	// Close releases any resources held by the searcher.
	Close()
	// Weight returns the raw query weight.
	Weight() float64
	// SetQueryNorm supplies the normalization factor used in scoring.
	SetQueryNorm(float64)
	// Count reports the number of candidate matches.
	Count() uint64
}
|
|
@ -0,0 +1,84 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/index"
|
||||
)
|
||||
|
||||
// TermSearcher produces scored matches for a single term query by
// walking the term's postings via a TermFieldReader.
type TermSearcher struct {
	index  index.Index
	query  *TermQuery
	reader index.TermFieldReader // postings for the query's term/field
	scorer *TermQueryScorer
}
|
||||
|
||||
func NewTermSearcher(index index.Index, query *TermQuery) (*TermSearcher, error) {
|
||||
reader, err := index.TermFieldReader([]byte(query.Term), query.Field)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
scorer := NewTermQueryScorer(query, index.DocCount(), reader.Count(), query.Explain)
|
||||
return &TermSearcher{
|
||||
index: index,
|
||||
query: query,
|
||||
reader: reader,
|
||||
scorer: scorer,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Count returns the number of documents containing the term, as
// reported by the underlying term field reader.
func (s *TermSearcher) Count() uint64 {
	return s.reader.Count()
}

// Weight reports this searcher's raw query weight, delegated to the scorer.
func (s *TermSearcher) Weight() float64 {
	return s.scorer.Weight()
}

// SetQueryNorm passes the query normalization factor through to the scorer.
func (s *TermSearcher) SetQueryNorm(qnorm float64) {
	s.scorer.SetQueryNorm(qnorm)
}
|
||||
|
||||
func (s *TermSearcher) Next() (*DocumentMatch, error) {
|
||||
termMatch, err := s.reader.Next()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if termMatch == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// score match
|
||||
docMatch := s.scorer.Score(termMatch)
|
||||
// return doc match
|
||||
return docMatch, nil
|
||||
|
||||
}
|
||||
|
||||
func (s *TermSearcher) Advance(ID string) (*DocumentMatch, error) {
|
||||
termMatch, err := s.reader.Advance(ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if termMatch == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// score match
|
||||
docMatch := s.scorer.Score(termMatch)
|
||||
|
||||
// return doc match
|
||||
return docMatch, nil
|
||||
}
|
||||
|
||||
// Close releases the underlying term field reader.
func (s *TermSearcher) Close() {
	s.reader.Close()
}
|
|
@ -0,0 +1,50 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
// stubSearcher is a canned Searcher for tests: it serves a fixed
// in-memory collection of matches, assumed to be in ascending ID
// order (Advance relies on this).
type stubSearcher struct {
	index   int                     // position of the next match to serve
	matches DocumentMatchCollection // the canned matches
}
|
||||
|
||||
func (ss *stubSearcher) Next() (*DocumentMatch, error) {
|
||||
if ss.index < len(ss.matches) {
|
||||
rv := ss.matches[ss.index]
|
||||
ss.index++
|
||||
return rv, nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (ss *stubSearcher) Advance(ID string) (*DocumentMatch, error) {
|
||||
|
||||
for ss.index < len(ss.matches) && ss.matches[ss.index].ID < ID {
|
||||
ss.index++
|
||||
}
|
||||
if ss.index < len(ss.matches) {
|
||||
rv := ss.matches[ss.index]
|
||||
ss.index++
|
||||
return rv, nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Close is a no-op; the stub holds no resources.
func (ss *stubSearcher) Close() {
}

// Weight always reports zero for the stub.
func (ss *stubSearcher) Weight() float64 {
	return 0.0
}

// SetQueryNorm ignores the normalization factor.
func (ss *stubSearcher) SetQueryNorm(float64) {
}

// Count returns the total number of canned matches.
func (ss *stubSearcher) Count() uint64 {
	return uint64(len(ss.matches))
}
|
|
@ -0,0 +1,24 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package search
|
||||
|
||||
import (
|
||||
"math"
|
||||
)
|
||||
|
||||
var SQRT_CACHE map[int]float64
|
||||
|
||||
const MAX_SQRT_CACHE = 64
|
||||
|
||||
func init() {
|
||||
SQRT_CACHE = make(map[int]float64, MAX_SQRT_CACHE)
|
||||
for i := 0; i < MAX_SQRT_CACHE; i++ {
|
||||
SQRT_CACHE[i] = math.Sqrt(float64(i))
|
||||
}
|
||||
}
|
|
@ -0,0 +1,64 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package shredder
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strconv"
|
||||
|
||||
"github.com/couchbaselabs/bleve/document"
|
||||
)
|
||||
|
||||
// A simple automatic JSON shredder which parses the whole document body.
|
||||
// Any strings found in the JSON are added as text fields
|
||||
|
||||
// AutoJsonShredder is stateless; it carries no configuration.
type AutoJsonShredder struct {
}

// NewAutoJsonShredder returns a new automatic JSON shredder.
func NewAutoJsonShredder() *AutoJsonShredder {
	return &AutoJsonShredder{}
}
|
||||
|
||||
func (s *AutoJsonShredder) Shred(id string, body []byte) (*document.Document, error) {
|
||||
rv := document.NewDocument(id)
|
||||
|
||||
var section interface{}
|
||||
err := json.Unmarshal(body, §ion)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
shredSection(rv, section, "")
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func shredSection(doc *document.Document, section interface{}, parent string) {
|
||||
nextParent := parent
|
||||
if nextParent != "" {
|
||||
nextParent = nextParent + "."
|
||||
}
|
||||
switch section := section.(type) {
|
||||
|
||||
case string:
|
||||
f := document.NewTextField(parent, []byte(section))
|
||||
doc.AddField(f)
|
||||
|
||||
case []interface{}:
|
||||
for i, sub := range section {
|
||||
shredSection(doc, sub, nextParent+strconv.Itoa(i))
|
||||
}
|
||||
|
||||
case map[string]interface{}:
|
||||
for k, sub := range section {
|
||||
shredSection(doc, sub, nextParent+k)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package shredder
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/document"
|
||||
"github.com/dustin/go-jsonpointer"
|
||||
)
|
||||
|
||||
// A JSON shredder which extracts only the values at pre-registered
// JSON pointer paths from the document body; each extracted value is
// added as a text field.
|
||||
|
||||
// JsonPointerShredder extracts only the registered JSON pointer paths
// from a document body, indexing each as a named field.
type JsonPointerShredder struct {
	fieldPaths map[string]string // field name -> JSON pointer path
	paths      []string          // all registered paths, for batch lookup in Shred
}
|
||||
|
||||
// NewJsonPointerShredder returns an empty shredder; register fields
// with AddTextField/AddField before calling Shred.
func NewJsonPointerShredder() *JsonPointerShredder {
	return &JsonPointerShredder{
		fieldPaths: make(map[string]string),
		paths:      make([]string, 0),
	}
}
|
||||
|
||||
// AddTextField registers a text field named name whose value will be
// extracted from the JSON pointer path.
func (s *JsonPointerShredder) AddTextField(name string, path string) {
	s.fieldPaths[name] = path
	s.paths = append(s.paths, path)
}

// AddField registers a field named name whose value will be extracted
// from the JSON pointer path.
// NOTE(review): currently identical to AddTextField — Shred treats
// every registered field as text; presumably intended to diverge later.
func (s *JsonPointerShredder) AddField(name string, path string) {
	s.fieldPaths[name] = path
	s.paths = append(s.paths, path)
}
|
||||
|
||||
func (s *JsonPointerShredder) Shred(id string, body []byte) (*document.Document, error) {
|
||||
rv := document.NewDocument(id)
|
||||
|
||||
values, err := jsonpointer.FindMany(body, s.paths)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for fieldName, fieldPath := range s.fieldPaths {
|
||||
field := document.NewTextField(fieldName, values[fieldPath])
|
||||
rv.AddField(field)
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package shredder
|
||||
|
||||
import (
|
||||
"github.com/couchbaselabs/bleve/document"
|
||||
)
|
||||
|
||||
type Shredder interface {
|
||||
Shred(id string, body []byte) (document.Document, error)
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"log"
|
||||
|
||||
"github.com/couchbaselabs/bleve/index/upside_down"
|
||||
)
|
||||
|
||||
// indexDir is the filesystem path of the index to dump.
var indexDir = flag.String("indexDir", "index", "index directory")

// main opens the upside_down index at -indexDir, dumps its contents,
// and exits; it logs fatally if the index cannot be opened.
func main() {
	flag.Parse()

	index := upside_down.NewUpsideDownCouch(*indexDir)
	err := index.Open()
	if err != nil {
		log.Fatal(err)
	}
	defer index.Close()

	index.Dump()
}
|
Loading…
Reference in New Issue