initial commit
This commit is contained in:
commit
3d842dfaf2
|
@ -0,0 +1,10 @@
|
||||||
|
#*
|
||||||
|
*.sublime-*
|
||||||
|
*~
|
||||||
|
.#*
|
||||||
|
.project
|
||||||
|
.settings
|
||||||
|
.DS_Store
|
||||||
|
/examples/bleve_index_json/bleve_index_json
|
||||||
|
/examples/bleve_query/bleve_query
|
||||||
|
/utils/bleve_dump/bleve_dump
|
|
@ -0,0 +1,24 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package keyword_analyzer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
"github.com/couchbaselabs/bleve/analysis/tokenizers/single_token"
|
||||||
|
)
|
||||||
|
|
||||||
|
func NewKeywordAnalyzer() (*analysis.Analyzer, error) {
|
||||||
|
keyword := analysis.Analyzer{
|
||||||
|
CharFilters: []analysis.CharFilter{},
|
||||||
|
Tokenizer: single_token.NewSingleTokenTokenizer(),
|
||||||
|
Filters: []analysis.TokenFilter{},
|
||||||
|
}
|
||||||
|
|
||||||
|
return &keyword, nil
|
||||||
|
}
|
|
@ -0,0 +1,39 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package standard_analyzer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_words_filter"
|
||||||
|
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
|
||||||
|
)
|
||||||
|
|
||||||
|
func NewStandardAnalyzer() (*analysis.Analyzer, error) {
|
||||||
|
lower_case_filter, err := lower_case_filter.NewLowerCaseFilter()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_words_filter, err := stop_words_filter.NewStopWordsFilter()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
standard := analysis.Analyzer{
|
||||||
|
CharFilters: []analysis.CharFilter{},
|
||||||
|
Tokenizer: unicode_word_boundary.NewUnicodeWordBoundaryTokenizer(),
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
lower_case_filter,
|
||||||
|
stop_words_filter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
return &standard, nil
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package html_char_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
|
||||||
|
)
|
||||||
|
|
||||||
|
// the origin of this regex is here:
// http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/
// slightly modified by me to also match the DOCTYPE
const htmlTagPattern = `</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`

// htmlRegex is compiled once at package load; MustCompile panics on a bad
// pattern, which is acceptable for a fixed package-level constant.
var htmlRegex = regexp.MustCompile(htmlTagPattern)
|
||||||
|
|
||||||
|
type HtmlCharFilter struct {
|
||||||
|
*regexp_char_filter.RegexpCharFilter
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewHtmlCharFilter() *HtmlCharFilter {
|
||||||
|
return &HtmlCharFilter{
|
||||||
|
regexp_char_filter.NewRegexpCharFilter(htmlRegex, []byte{' '}),
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package html_char_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestHtmlCharFilter verifies that markup is replaced by an equal number of
// spaces, leaving the visible text at its original byte offsets.
// NOTE(review): the expected-output raw string carries trailing spaces on
// each line (one per byte of the replaced tag) — do not strip them.
func TestHtmlCharFilter(t *testing.T) {
	tests := []struct {
		input  []byte
		output []byte
	}{
		{
			input: []byte(`<!DOCTYPE html>
<html>
<body>

<h1>My First Heading</h1>

<p>My first paragraph.</p>

</body>
</html>`),
			output: []byte(`               
      
      

    My First Heading     

   My first paragraph.    

       
       `),
		},
	}

	for _, test := range tests {
		filter := NewHtmlCharFilter()
		output := filter.Filter(test.input)
		if !reflect.DeepEqual(output, test.output) {
			t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
		}
	}
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package regexp_char_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"regexp"
|
||||||
|
)
|
||||||
|
|
||||||
|
type RegexpCharFilter struct {
|
||||||
|
r *regexp.Regexp
|
||||||
|
replacement []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter {
|
||||||
|
return &RegexpCharFilter{
|
||||||
|
r: r,
|
||||||
|
replacement: replacement,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *RegexpCharFilter) Filter(input []byte) []byte {
|
||||||
|
return s.r.ReplaceAllFunc(input, func(in []byte) []byte { return bytes.Repeat(s.replacement, len(in)) })
|
||||||
|
}
|
|
@ -0,0 +1,55 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package analysis
|
||||||
|
|
||||||
|
// TokenLocation records one occurrence of a term in the analyzed text:
// byte offsets [Start, End) and the token's Position in the stream
// (tokenizers in this tree assign positions starting at 1).
type TokenLocation struct {
	Start    int
	End      int
	Position int
}

// TokenFreq aggregates every location at which a single term occurred.
type TokenFreq struct {
	Term      []byte
	Locations []*TokenLocation
}
|
||||||
|
|
||||||
|
func TokenFrequency(tokens TokenStream) []*TokenFreq {
|
||||||
|
index := make(map[string]*TokenFreq)
|
||||||
|
|
||||||
|
for _, token := range tokens {
|
||||||
|
curr, ok := index[string(token.Term)]
|
||||||
|
if ok {
|
||||||
|
curr.Locations = append(curr.Locations, &TokenLocation{
|
||||||
|
Start: token.Start,
|
||||||
|
End: token.End,
|
||||||
|
Position: token.Position,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
index[string(token.Term)] = &TokenFreq{
|
||||||
|
Term: token.Term,
|
||||||
|
Locations: []*TokenLocation{
|
||||||
|
&TokenLocation{
|
||||||
|
Start: token.Start,
|
||||||
|
End: token.End,
|
||||||
|
Position: token.Position,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
rv := make([]*TokenFreq, len(index))
|
||||||
|
i := 0
|
||||||
|
for _, tf := range index {
|
||||||
|
rv[i] = tf
|
||||||
|
i += 1
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
|
@ -0,0 +1,44 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package length_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
type LengthFilter struct {
|
||||||
|
min int
|
||||||
|
max int
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewLengthFilter(min, max int) (*LengthFilter, error) {
|
||||||
|
return &LengthFilter{
|
||||||
|
min: min,
|
||||||
|
max: max,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
rv := make(analysis.TokenStream, 0)
|
||||||
|
|
||||||
|
for _, token := range input {
|
||||||
|
wordLen := utf8.RuneCount(token.Term)
|
||||||
|
if f.min > 0 && f.min > wordLen {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if f.max > 0 && f.max < wordLen {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rv = append(rv, token)
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
|
@ -0,0 +1,102 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package length_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestLengthFilter(t *testing.T) {
|
||||||
|
|
||||||
|
inputTokenStream := analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("1"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("two"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("three"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
lengthFilter, err := NewLengthFilter(3, 4)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
ouputTokenStream := lengthFilter.Filter(inputTokenStream)
|
||||||
|
if len(ouputTokenStream) != 1 {
|
||||||
|
t.Fatalf("expected 1 output token")
|
||||||
|
}
|
||||||
|
if string(ouputTokenStream[0].Term) != "two" {
|
||||||
|
t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLengthFilterNoMax(t *testing.T) {
|
||||||
|
|
||||||
|
inputTokenStream := analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("1"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("two"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("three"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
lengthFilter, err := NewLengthFilter(3, -1)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
ouputTokenStream := lengthFilter.Filter(inputTokenStream)
|
||||||
|
if len(ouputTokenStream) != 2 {
|
||||||
|
t.Fatalf("expected 2 output token")
|
||||||
|
}
|
||||||
|
if string(ouputTokenStream[0].Term) != "two" {
|
||||||
|
t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term)
|
||||||
|
}
|
||||||
|
if string(ouputTokenStream[1].Term) != "three" {
|
||||||
|
t.Errorf("expected term `three`, got `%s`", ouputTokenStream[0].Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLengthFilterNoMin(t *testing.T) {
|
||||||
|
|
||||||
|
inputTokenStream := analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("1"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("two"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("three"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
lengthFilter, err := NewLengthFilter(-1, 4)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
ouputTokenStream := lengthFilter.Filter(inputTokenStream)
|
||||||
|
if len(ouputTokenStream) != 2 {
|
||||||
|
t.Fatalf("expected 2 output token")
|
||||||
|
}
|
||||||
|
if string(ouputTokenStream[0].Term) != "1" {
|
||||||
|
t.Errorf("expected term `1`, got `%s`", ouputTokenStream[0].Term)
|
||||||
|
}
|
||||||
|
if string(ouputTokenStream[1].Term) != "two" {
|
||||||
|
t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term)
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,35 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package lower_case_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
type LowerCaseFilter struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewLowerCaseFilter() (*LowerCaseFilter, error) {
|
||||||
|
return &LowerCaseFilter{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
rv := make(analysis.TokenStream, 0)
|
||||||
|
|
||||||
|
for _, token := range input {
|
||||||
|
word := string(token.Term)
|
||||||
|
wordLowerCase := strings.ToLower(word)
|
||||||
|
token.Term = []byte(wordLowerCase)
|
||||||
|
rv = append(rv, token)
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package lower_case_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestLowerCaseFilter(t *testing.T) {
|
||||||
|
|
||||||
|
inputTokenStream := analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ONE"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("two"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ThReE"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedTokenStream := analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("one"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("two"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("three"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
filter, err := NewLowerCaseFilter()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||||
|
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||||
|
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,46 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package stemmer_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bitbucket.org/tebeka/snowball"
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
type StemmerFilter struct {
|
||||||
|
lang string
|
||||||
|
stemmer *snowball.Stemmer
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewStemmerFilter(lang string) (*StemmerFilter, error) {
|
||||||
|
stemmer, err := snowball.New(lang)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &StemmerFilter{
|
||||||
|
lang: lang,
|
||||||
|
stemmer: stemmer,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *StemmerFilter) List() []string {
|
||||||
|
return snowball.LangList()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
rv := make(analysis.TokenStream, 0)
|
||||||
|
|
||||||
|
for _, token := range input {
|
||||||
|
stemmed := s.stemmer.Stem(string(token.Term))
|
||||||
|
token.Term = []byte(stemmed)
|
||||||
|
rv = append(rv, token)
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package stemmer_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestStemmerFilter(t *testing.T) {
|
||||||
|
|
||||||
|
inputTokenStream := analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("walking"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("talked"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("business"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedTokenStream := analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("walk"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("talk"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("busi"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
filter, err := NewStemmerFilter("english")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||||
|
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||||
|
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,53 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package stop_words_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DEFAULT_STOP_WORDS is the built-in English stop word list consumed by
// NewStopWordsFilter. Exported name kept as-is for compatibility.
var DEFAULT_STOP_WORDS []string = []string{
	"a", "an", "and", "are", "as", "at", "be", "but", "by",
	"for", "if", "in", "into", "is", "it",
	"no", "not", "of", "on", "or", "such",
	"that", "the", "their", "then", "there", "these",
	"they", "this", "to", "was", "will", "with",
}
|
||||||
|
|
||||||
|
type StopWordsFilter struct {
|
||||||
|
stopWords map[string]bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewStopWordsFilter() (*StopWordsFilter, error) {
|
||||||
|
return &StopWordsFilter{
|
||||||
|
stopWords: buildStopWordMap(DEFAULT_STOP_WORDS),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
rv := make(analysis.TokenStream, 0)
|
||||||
|
|
||||||
|
for _, token := range input {
|
||||||
|
word := string(token.Term)
|
||||||
|
_, isStopWord := f.stopWords[word]
|
||||||
|
if !isStopWord {
|
||||||
|
rv = append(rv, token)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildStopWordMap converts a word list into a membership map for O(1)
// lookups.
func buildStopWordMap(words []string) map[string]bool {
	rv := make(map[string]bool, len(words))
	for i := range words {
		rv[words[i]] = true
	}
	return rv
}
|
|
@ -0,0 +1,55 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package stop_words_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestStopWordsFilter(t *testing.T) {
|
||||||
|
|
||||||
|
inputTokenStream := analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("a"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("walk"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("in"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("the"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("park"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedTokenStream := analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("walk"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("park"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
filter, err := NewStopWordsFilter()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||||
|
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||||
|
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,40 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package regexp_tokenizer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
type RegexpTokenizer struct {
|
||||||
|
r *regexp.Regexp
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewRegexpTokenizer(r *regexp.Regexp) *RegexpTokenizer {
|
||||||
|
return &RegexpTokenizer{
|
||||||
|
r: r,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||||
|
matches := rt.r.FindAllIndex(input, -1)
|
||||||
|
rv := make(analysis.TokenStream, len(matches))
|
||||||
|
for i, match := range matches {
|
||||||
|
token := analysis.Token{
|
||||||
|
Term: input[match[0]:match[1]],
|
||||||
|
Start: match[0],
|
||||||
|
End: match[1],
|
||||||
|
Position: i + 1,
|
||||||
|
}
|
||||||
|
rv[i] = &token
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
|
@ -0,0 +1,29 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package simple_word_boundary
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
|
||||||
|
)
|
||||||
|
|
||||||
|
// wordPattern matches maximal runs of word characters; punctuation and
// whitespace act as token boundaries.
const wordPattern = `\w+`

// wordRegex is compiled once at package load.
var wordRegex = regexp.MustCompile(wordPattern)

// SimpleWordBoundaryTokenizer splits text on non-word characters by
// delegating to a RegexpTokenizer over wordPattern.
type SimpleWordBoundaryTokenizer struct {
	*regexp_tokenizer.RegexpTokenizer
}

// NewSimpleWordBoundaryTokenizer constructs the tokenizer.
func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer {
	return &SimpleWordBoundaryTokenizer{
		regexp_tokenizer.NewRegexpTokenizer(wordRegex),
	}
}
|
|
@ -0,0 +1,51 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package simple_word_boundary
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestBoundary(t *testing.T) {
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
[]byte("Hello World."),
|
||||||
|
analysis.TokenStream{
|
||||||
|
{
|
||||||
|
0,
|
||||||
|
5,
|
||||||
|
[]byte("Hello"),
|
||||||
|
1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
6,
|
||||||
|
11,
|
||||||
|
[]byte("World"),
|
||||||
|
2,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
tokenizer := NewSimpleWordBoundaryTokenizer()
|
||||||
|
actual := tokenizer.Tokenize(test.input)
|
||||||
|
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package single_token
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SingleTokenTokenizer is a tokenizer that emits its entire input as a
// single token.  It carries no state.
type SingleTokenTokenizer struct{}

// NewSingleTokenTokenizer returns a ready-to-use SingleTokenTokenizer.
func NewSingleTokenTokenizer() *SingleTokenTokenizer {
	return new(SingleTokenTokenizer)
}
|
||||||
|
|
||||||
|
func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||||
|
return analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: input,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: len(input),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,67 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package single_token
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestSingleTokenTokenizer(t *testing.T) {
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
[]byte("Hello World"),
|
||||||
|
analysis.TokenStream{
|
||||||
|
{
|
||||||
|
0,
|
||||||
|
11,
|
||||||
|
[]byte("Hello World"),
|
||||||
|
1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
[]byte("こんにちは世界"),
|
||||||
|
analysis.TokenStream{
|
||||||
|
{
|
||||||
|
0,
|
||||||
|
21,
|
||||||
|
[]byte("こんにちは世界"),
|
||||||
|
1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
||||||
|
analysis.TokenStream{
|
||||||
|
{
|
||||||
|
0,
|
||||||
|
72,
|
||||||
|
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
||||||
|
1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
tokenizer := NewSingleTokenTokenizer()
|
||||||
|
actual := tokenizer.Tokenize(test.input)
|
||||||
|
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,114 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package unicode_word_boundary
|
||||||
|
|
||||||
|
// #cgo pkg-config: icu-uc
|
||||||
|
// #include <stdio.h>
|
||||||
|
// #include <stdlib.h>
|
||||||
|
// #include "unicode/utypes.h"
|
||||||
|
// #include "unicode/uchar.h"
|
||||||
|
// #include "unicode/ubrk.h"
|
||||||
|
// #include "unicode/ustring.h"
|
||||||
|
import "C"
|
||||||
|
|
||||||
|
import "log"
|
||||||
|
import "unsafe"
|
||||||
|
import "github.com/couchbaselabs/bleve/analysis"
|
||||||
|
|
||||||
|
type UnicodeWordBoundaryTokenizer struct {
|
||||||
|
locale *C.char
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewUnicodeWordBoundaryTokenizer() *UnicodeWordBoundaryTokenizer {
|
||||||
|
return &UnicodeWordBoundaryTokenizer{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewUnicodeWordBoundaryCustomLocaleTokenizer(locale string) *UnicodeWordBoundaryTokenizer {
|
||||||
|
return &UnicodeWordBoundaryTokenizer{
|
||||||
|
locale: C.CString(locale),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||||
|
// var bi *C.UBreakIterator
|
||||||
|
rv := make(analysis.TokenStream, 0)
|
||||||
|
defer C.free(unsafe.Pointer(t.locale))
|
||||||
|
|
||||||
|
if len(input) < 1 {
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
// works
|
||||||
|
var myUnsafePointer = unsafe.Pointer(&(input[0]))
|
||||||
|
var myCCharPointer *C.char = (*C.char)(myUnsafePointer)
|
||||||
|
|
||||||
|
var inlen C.int32_t = C.int32_t(len(input))
|
||||||
|
var buflen C.int32_t = C.int32_t(2*len(input) + 1) // worse case each byte becomes 2
|
||||||
|
var stringToExamine []C.UChar = make([]C.UChar, buflen)
|
||||||
|
//log.Printf("new buff is: %v", stringToExamine)
|
||||||
|
var myUnsafePointerToExamine = unsafe.Pointer(&(stringToExamine[0]))
|
||||||
|
var myUCharPointer *C.UChar = (*C.UChar)(myUnsafePointerToExamine)
|
||||||
|
C.u_uastrncpy(myUCharPointer, myCCharPointer, inlen)
|
||||||
|
|
||||||
|
//log.Printf("after copy new buff is: %v", stringToExamine)
|
||||||
|
|
||||||
|
var err C.UErrorCode = C.U_ZERO_ERROR
|
||||||
|
bi := C.ubrk_open(C.UBRK_WORD, t.locale, myUCharPointer, -1, &err)
|
||||||
|
|
||||||
|
if err > C.U_ZERO_ERROR {
|
||||||
|
log.Printf("error opening boundary iterator")
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
defer C.ubrk_close(bi)
|
||||||
|
|
||||||
|
position := 0
|
||||||
|
var prev C.int32_t
|
||||||
|
p := C.ubrk_first(bi)
|
||||||
|
for p != C.UBRK_DONE {
|
||||||
|
|
||||||
|
q := C.ubrk_getRuleStatus(bi)
|
||||||
|
|
||||||
|
// convert boundaries back to utf8 positions
|
||||||
|
var nilCString *C.char
|
||||||
|
var indexA C.int32_t
|
||||||
|
|
||||||
|
C.u_strToUTF8(nilCString, 0, &indexA, myUCharPointer, prev, &err)
|
||||||
|
if err > C.U_ZERO_ERROR && err != C.U_BUFFER_OVERFLOW_ERROR {
|
||||||
|
log.Printf("error converting boundary %d", err)
|
||||||
|
return rv
|
||||||
|
} else {
|
||||||
|
err = C.U_ZERO_ERROR
|
||||||
|
}
|
||||||
|
|
||||||
|
var indexB C.int32_t
|
||||||
|
C.u_strToUTF8(nilCString, 0, &indexB, myUCharPointer, p, &err)
|
||||||
|
if err > C.U_ZERO_ERROR && err != C.U_BUFFER_OVERFLOW_ERROR {
|
||||||
|
log.Printf("error converting boundary %d", err)
|
||||||
|
return rv
|
||||||
|
} else {
|
||||||
|
err = C.U_ZERO_ERROR
|
||||||
|
}
|
||||||
|
|
||||||
|
if q != 0 {
|
||||||
|
position += 1
|
||||||
|
token := analysis.Token{
|
||||||
|
Start: int(indexA),
|
||||||
|
End: int(indexB),
|
||||||
|
Term: input[indexA:indexB],
|
||||||
|
Position: position,
|
||||||
|
}
|
||||||
|
rv = append(rv, &token)
|
||||||
|
}
|
||||||
|
prev = p
|
||||||
|
p = C.ubrk_next(bi)
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
|
@ -0,0 +1,125 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package unicode_word_boundary
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestBoundary(t *testing.T) {
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
locale string
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
[]byte("Hello World"),
|
||||||
|
"en_US",
|
||||||
|
analysis.TokenStream{
|
||||||
|
{
|
||||||
|
0,
|
||||||
|
5,
|
||||||
|
[]byte("Hello"),
|
||||||
|
1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
6,
|
||||||
|
11,
|
||||||
|
[]byte("World"),
|
||||||
|
2,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
[]byte("こんにちは世界"),
|
||||||
|
"en_US",
|
||||||
|
analysis.TokenStream{
|
||||||
|
{
|
||||||
|
0,
|
||||||
|
15,
|
||||||
|
[]byte("こんにちは"),
|
||||||
|
1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
15,
|
||||||
|
21,
|
||||||
|
[]byte("世界"),
|
||||||
|
2,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
||||||
|
"th_TH",
|
||||||
|
analysis.TokenStream{
|
||||||
|
{
|
||||||
|
0,
|
||||||
|
9,
|
||||||
|
[]byte("แยก"),
|
||||||
|
1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
9,
|
||||||
|
15,
|
||||||
|
[]byte("คำ"),
|
||||||
|
2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
15,
|
||||||
|
27,
|
||||||
|
[]byte("ภาษา"),
|
||||||
|
3,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
27,
|
||||||
|
36,
|
||||||
|
[]byte("ไทย"),
|
||||||
|
4,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
36,
|
||||||
|
42,
|
||||||
|
[]byte("ก็"),
|
||||||
|
5,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
42,
|
||||||
|
57,
|
||||||
|
[]byte("ทำได้"),
|
||||||
|
6,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
57,
|
||||||
|
63,
|
||||||
|
[]byte("นะ"),
|
||||||
|
7,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
63,
|
||||||
|
72,
|
||||||
|
[]byte("จ้ะ"),
|
||||||
|
8,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
tokenizer := NewUnicodeWordBoundaryCustomLocaleTokenizer(test.locale)
|
||||||
|
actual := tokenizer.Tokenize(test.input)
|
||||||
|
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,59 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package analysis
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CharFilter transforms raw input bytes before tokenization.
type CharFilter interface {
	Filter([]byte) []byte
}

// Token is a single unit of analyzed text.  Start and End are byte
// offsets of Term within the original input; Position is the 1-based
// index of the token within its stream.
type Token struct {
	Start    int
	End      int
	Term     []byte
	Position int
}

// String renders the token in a human-readable form for debugging.
func (t *Token) String() string {
	return fmt.Sprintf("Start: %d End: %d Position: %d Token: %s", t.Start, t.End, t.Position, string(t.Term))
}

// TokenStream is an ordered sequence of tokens.
type TokenStream []*Token

// Tokenizer splits input bytes into a stream of tokens.
type Tokenizer interface {
	Tokenize([]byte) TokenStream
}

// TokenFilter transforms one token stream into another.
type TokenFilter interface {
	Filter(TokenStream) TokenStream
}
|
||||||
|
|
||||||
|
type Analyzer struct {
|
||||||
|
CharFilters []CharFilter
|
||||||
|
Tokenizer Tokenizer
|
||||||
|
TokenFilters []TokenFilter
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *Analyzer) Analyze(input []byte) TokenStream {
|
||||||
|
if a.CharFilters != nil {
|
||||||
|
for _, cf := range a.CharFilters {
|
||||||
|
input = cf.Filter(input)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tokens := a.Tokenizer.Tokenize(input)
|
||||||
|
if a.TokenFilters != nil {
|
||||||
|
for _, tf := range a.TokenFilters {
|
||||||
|
tokens = tf.Filter(tokens)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tokens
|
||||||
|
}
|
|
@ -0,0 +1,34 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package document
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Document struct {
|
||||||
|
ID string `json:"id"`
|
||||||
|
Fields []*Field `json:"fields"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewDocument(id string) *Document {
|
||||||
|
return &Document{
|
||||||
|
ID: id,
|
||||||
|
Fields: make([]*Field, 0),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *Document) AddField(f *Field) {
|
||||||
|
d.Fields = append(d.Fields, f)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *Document) String() string {
|
||||||
|
bytes, _ := json.MarshalIndent(d, "", " ")
|
||||||
|
return string(bytes)
|
||||||
|
}
|
|
@ -0,0 +1,29 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package document
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Field struct {
|
||||||
|
Name string
|
||||||
|
IndexingOptions int
|
||||||
|
Analyzer *analysis.Analyzer
|
||||||
|
Value []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewField(name string, value []byte, indexingOptions int, analyzer *analysis.Analyzer) *Field {
|
||||||
|
return &Field{
|
||||||
|
Name: name,
|
||||||
|
IndexingOptions: indexingOptions,
|
||||||
|
Analyzer: analyzer,
|
||||||
|
Value: value,
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package document
|
||||||
|
|
||||||
|
import (
|
||||||
|
"log"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
"github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer"
|
||||||
|
)
|
||||||
|
|
||||||
|
var standardAnalyzer *analysis.Analyzer
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var err error
|
||||||
|
standardAnalyzer, err = standard_analyzer.NewStandardAnalyzer()
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const DEFAULT_TEXT_INDEXING_OPTIONS = INDEX_FIELD
|
||||||
|
|
||||||
|
func NewTextField(name string, value []byte) *Field {
|
||||||
|
return NewTextFieldWithIndexingOptions(name, value, DEFAULT_TEXT_INDEXING_OPTIONS)
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewTextFieldWithIndexingOptions(name string, value []byte, indexingOptions int) *Field {
|
||||||
|
return &Field{
|
||||||
|
Name: name,
|
||||||
|
IndexingOptions: indexingOptions,
|
||||||
|
Analyzer: standardAnalyzer,
|
||||||
|
Value: value,
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,27 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package document
|
||||||
|
|
||||||
|
// Indexing option flags.  They combine as a bitmask in
// Field.IndexingOptions and are tested with the predicates below.
const (
	INDEX_FIELD = 1 << iota
	STORE_FIELD
	INCLUDE_TERM_VECTORS
)

// IsIndexedField reports whether the INDEX_FIELD bit is set in arg.
func IsIndexedField(arg int) bool {
	return arg&INDEX_FIELD != 0
}

// IsStoredField reports whether the STORE_FIELD bit is set in arg.
func IsStoredField(arg int) bool {
	return arg&STORE_FIELD != 0
}

// IncludeTermVectors reports whether the INCLUDE_TERM_VECTORS bit is
// set in arg.
func IncludeTermVectors(arg int) bool {
	return arg&INCLUDE_TERM_VECTORS != 0
}
|
|
@ -0,0 +1,68 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package document
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestIndexingOptions(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
indexingOptions int
|
||||||
|
isIndexed bool
|
||||||
|
isStored bool
|
||||||
|
includeTermVectors bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
indexingOptions: INDEX_FIELD | STORE_FIELD | INCLUDE_TERM_VECTORS,
|
||||||
|
isIndexed: true,
|
||||||
|
isStored: true,
|
||||||
|
includeTermVectors: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
indexingOptions: INDEX_FIELD | INCLUDE_TERM_VECTORS,
|
||||||
|
isIndexed: true,
|
||||||
|
isStored: false,
|
||||||
|
includeTermVectors: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
indexingOptions: STORE_FIELD | INCLUDE_TERM_VECTORS,
|
||||||
|
isIndexed: false,
|
||||||
|
isStored: true,
|
||||||
|
includeTermVectors: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
indexingOptions: INDEX_FIELD,
|
||||||
|
isIndexed: true,
|
||||||
|
isStored: false,
|
||||||
|
includeTermVectors: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
indexingOptions: STORE_FIELD,
|
||||||
|
isIndexed: false,
|
||||||
|
isStored: true,
|
||||||
|
includeTermVectors: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
actuallyIndexed := IsIndexedField(test.indexingOptions)
|
||||||
|
if actuallyIndexed != test.isIndexed {
|
||||||
|
t.Errorf("expected indexed to be %v, got %v for %d", test.isIndexed, actuallyIndexed, test.indexingOptions)
|
||||||
|
}
|
||||||
|
actuallyStored := IsStoredField(test.indexingOptions)
|
||||||
|
if actuallyStored != test.isStored {
|
||||||
|
t.Errorf("expected stored to be %v, got %v for %d", test.isStored, actuallyStored, test.indexingOptions)
|
||||||
|
}
|
||||||
|
actuallyIncludeTermVectors := IncludeTermVectors(test.indexingOptions)
|
||||||
|
if actuallyIncludeTermVectors != test.includeTermVectors {
|
||||||
|
t.Errorf("expected includeTermVectors to be %v, got %v for %d", test.includeTermVectors, actuallyIncludeTermVectors, test.indexingOptions)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,63 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"io/ioutil"
|
||||||
|
"log"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/index/upside_down"
|
||||||
|
"github.com/couchbaselabs/bleve/shredder"
|
||||||
|
)
|
||||||
|
|
||||||
|
var jsonDir = flag.String("jsonDir", "json", "json directory")
|
||||||
|
var indexDir = flag.String("indexDir", "index", "index directory")
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
// create a automatic JSON document shredder
|
||||||
|
jsonShredder := shredder.NewAutoJsonShredder()
|
||||||
|
|
||||||
|
// create a new index
|
||||||
|
index := upside_down.NewUpsideDownCouch(*indexDir)
|
||||||
|
err := index.Open()
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
defer index.Close()
|
||||||
|
|
||||||
|
// open the directory
|
||||||
|
dirEntries, err := ioutil.ReadDir(*jsonDir)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// walk the directory entries
|
||||||
|
for _, dirEntry := range dirEntries {
|
||||||
|
// read the bytes
|
||||||
|
jsonBytes, err := ioutil.ReadFile(*jsonDir + "/" + dirEntry.Name())
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
// shred them into a document
|
||||||
|
doc, err := jsonShredder.Shred(dirEntry.Name(), jsonBytes)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
//log.Printf("%+v", doc)
|
||||||
|
// update the index
|
||||||
|
err = index.Update(doc)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,70 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/index/upside_down"
|
||||||
|
"github.com/couchbaselabs/bleve/search"
|
||||||
|
)
|
||||||
|
|
||||||
|
var field = flag.String("field", "description", "field to query")
|
||||||
|
var indexDir = flag.String("indexDir", "index", "index directory")
|
||||||
|
var limit = flag.Int("limit", 10, "limit to first N results")
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
if flag.NArg() < 1 {
|
||||||
|
log.Fatal("Specify search term")
|
||||||
|
}
|
||||||
|
|
||||||
|
// open index
|
||||||
|
index := upside_down.NewUpsideDownCouch(*indexDir)
|
||||||
|
err := index.Open()
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
defer index.Close()
|
||||||
|
|
||||||
|
tq := search.TermQuery{
|
||||||
|
Term: flag.Arg(0),
|
||||||
|
Field: *field,
|
||||||
|
BoostVal: 1.0,
|
||||||
|
Explain: true,
|
||||||
|
}
|
||||||
|
collector := search.NewTopScorerCollector(*limit)
|
||||||
|
searcher, err := tq.Searcher(index)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("searcher error: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
err = collector.Collect(searcher)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("search error: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
results := collector.Results()
|
||||||
|
if len(results) == 0 {
|
||||||
|
fmt.Printf("No matches\n")
|
||||||
|
} else {
|
||||||
|
last := uint64(*limit)
|
||||||
|
if searcher.Count() < last {
|
||||||
|
last = searcher.Count()
|
||||||
|
}
|
||||||
|
fmt.Printf("%d matches, showing %d through %d\n", searcher.Count(), 1, last)
|
||||||
|
for i, result := range results {
|
||||||
|
fmt.Printf("%2d. %s (%f)\n", i+1, result.ID, result.Score)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,48 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package index
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/document"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Index interface {
|
||||||
|
Open() error
|
||||||
|
Close()
|
||||||
|
|
||||||
|
Update(doc *document.Document) error
|
||||||
|
Delete(id string) error
|
||||||
|
|
||||||
|
TermFieldReader(term []byte, field string) (TermFieldReader, error)
|
||||||
|
|
||||||
|
DocCount() uint64
|
||||||
|
|
||||||
|
Dump()
|
||||||
|
}
|
||||||
|
|
||||||
|
type TermFieldVector struct {
|
||||||
|
Field string
|
||||||
|
Pos uint64
|
||||||
|
Start uint64
|
||||||
|
End uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
type TermFieldDoc struct {
|
||||||
|
ID string
|
||||||
|
Freq uint64
|
||||||
|
Norm float64
|
||||||
|
Vectors []*TermFieldVector
|
||||||
|
}
|
||||||
|
|
||||||
|
type TermFieldReader interface {
|
||||||
|
Next() (*TermFieldDoc, error)
|
||||||
|
Advance(ID string) (*TermFieldDoc, error)
|
||||||
|
Count() uint64
|
||||||
|
Close()
|
||||||
|
}
|
|
@ -0,0 +1,227 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package mock
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"sort"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
"github.com/couchbaselabs/bleve/document"
|
||||||
|
"github.com/couchbaselabs/bleve/index"
|
||||||
|
)
|
||||||
|
|
||||||
|
type mockFreq struct {
|
||||||
|
freq uint64
|
||||||
|
norm float64
|
||||||
|
vectors []*index.TermFieldVector
|
||||||
|
}
|
||||||
|
|
||||||
|
// key doc id
|
||||||
|
type mockDocFreq map[string]*mockFreq
|
||||||
|
|
||||||
|
//key field
|
||||||
|
type mockFieldDocFreq map[string]mockDocFreq
|
||||||
|
|
||||||
|
// 2 dim array
|
||||||
|
// inner level are always pairs (field name, term)
|
||||||
|
type mockBackIndexEntry [][]string
|
||||||
|
|
||||||
|
type MockIndex struct {
|
||||||
|
|
||||||
|
//this level of the map, the key is the term
|
||||||
|
termIndex map[string]mockFieldDocFreq
|
||||||
|
|
||||||
|
// key is docid
|
||||||
|
backIndex map[string]mockBackIndexEntry
|
||||||
|
|
||||||
|
docCount uint64
|
||||||
|
analyzer map[string]*analysis.Analyzer
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewMockIndexWithDocs(docs []*document.Document) *MockIndex {
|
||||||
|
rv := NewMockIndex()
|
||||||
|
for _, doc := range docs {
|
||||||
|
rv.Update(doc)
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewMockIndex() *MockIndex {
|
||||||
|
mi := MockIndex{
|
||||||
|
termIndex: make(map[string]mockFieldDocFreq),
|
||||||
|
backIndex: make(map[string]mockBackIndexEntry),
|
||||||
|
analyzer: make(map[string]*analysis.Analyzer),
|
||||||
|
}
|
||||||
|
|
||||||
|
return &mi
|
||||||
|
}
|
||||||
|
|
||||||
|
func (index *MockIndex) Open() error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (index *MockIndex) Close() {}
|
||||||
|
|
||||||
|
// for this implementation we dont care about performance
|
||||||
|
// update is simply delete then add
|
||||||
|
func (index *MockIndex) Update(doc *document.Document) error {
|
||||||
|
index.Delete(doc.ID)
|
||||||
|
|
||||||
|
backIndexEntry := make(mockBackIndexEntry, 0)
|
||||||
|
for _, field := range doc.Fields {
|
||||||
|
|
||||||
|
analyzer := field.Analyzer
|
||||||
|
tokens := analyzer.Analyze(field.Value)
|
||||||
|
fieldLength := len(tokens) // number of tokens in this doc field
|
||||||
|
fieldNorm := 1.0 / math.Sqrt(float64(fieldLength))
|
||||||
|
tokenFreqs := analysis.TokenFrequency(tokens)
|
||||||
|
for _, tf := range tokenFreqs {
|
||||||
|
mf := mockFreq{
|
||||||
|
freq: uint64(len(tf.Locations)),
|
||||||
|
norm: fieldNorm,
|
||||||
|
}
|
||||||
|
if document.IncludeTermVectors(field.IndexingOptions) {
|
||||||
|
mf.vectors = index.mockVectorsFromTokenFreq(field.Name, tf)
|
||||||
|
}
|
||||||
|
termString := string(tf.Term)
|
||||||
|
fieldMap, ok := index.termIndex[termString]
|
||||||
|
if !ok {
|
||||||
|
fieldMap = make(map[string]mockDocFreq)
|
||||||
|
index.termIndex[termString] = fieldMap
|
||||||
|
}
|
||||||
|
docMap, ok := fieldMap[field.Name]
|
||||||
|
if !ok {
|
||||||
|
docMap = make(map[string]*mockFreq)
|
||||||
|
fieldMap[field.Name] = docMap
|
||||||
|
}
|
||||||
|
docMap[doc.ID] = &mf
|
||||||
|
backIndexInnerEntry := []string{field.Name, termString}
|
||||||
|
backIndexEntry = append(backIndexEntry, backIndexInnerEntry)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
index.backIndex[doc.ID] = backIndexEntry
|
||||||
|
index.docCount += 1
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (index *MockIndex) Delete(id string) error {
|
||||||
|
backIndexEntry, existed := index.backIndex[id]
|
||||||
|
if existed {
|
||||||
|
for _, backIndexPair := range backIndexEntry {
|
||||||
|
if len(backIndexPair) == 2 {
|
||||||
|
field := backIndexPair[0]
|
||||||
|
term := backIndexPair[1]
|
||||||
|
delete(index.termIndex[term][field], id)
|
||||||
|
if len(index.termIndex[term][field]) == 0 {
|
||||||
|
delete(index.termIndex[term], field)
|
||||||
|
if len(index.termIndex[term]) == 0 {
|
||||||
|
delete(index.termIndex, term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
delete(index.backIndex, id)
|
||||||
|
index.docCount -= 1
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (index *MockIndex) TermFieldReader(term []byte, field string) (index.TermFieldReader, error) {
|
||||||
|
|
||||||
|
fdf, ok := index.termIndex[string(term)]
|
||||||
|
if !ok {
|
||||||
|
fdf = make(mockFieldDocFreq)
|
||||||
|
}
|
||||||
|
docFreqs, ok := fdf[field]
|
||||||
|
if !ok {
|
||||||
|
docFreqs = make(mockDocFreq)
|
||||||
|
}
|
||||||
|
mtfr := mockTermFieldReader{
|
||||||
|
index: docFreqs,
|
||||||
|
sortedDocIds: make(sort.StringSlice, len(docFreqs)),
|
||||||
|
curr: -1,
|
||||||
|
}
|
||||||
|
i := 0
|
||||||
|
for k, _ := range docFreqs {
|
||||||
|
mtfr.sortedDocIds[i] = k
|
||||||
|
i += 1
|
||||||
|
}
|
||||||
|
sort.Sort(mtfr.sortedDocIds)
|
||||||
|
|
||||||
|
return &mtfr, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// DocCount returns the number of documents currently in the index.
func (index *MockIndex) DocCount() uint64 {
	return index.docCount
}
|
||||||
|
|
||||||
|
// mockTermFieldReader iterates the documents matching a single
// term/field pair, in ascending doc id order.
type mockTermFieldReader struct {
	index        mockDocFreq      // doc id -> frequency entry for this term/field
	sortedDocIds sort.StringSlice // doc ids in ascending order
	curr         int              // index of last returned doc; -1 before first Next
}
|
||||||
|
|
||||||
|
func (reader *mockTermFieldReader) Next() (*index.TermFieldDoc, error) {
|
||||||
|
next := reader.curr + 1
|
||||||
|
if next < len(reader.sortedDocIds) {
|
||||||
|
nextTermKey := reader.sortedDocIds[next]
|
||||||
|
nextTerm := reader.index[nextTermKey]
|
||||||
|
reader.curr = next
|
||||||
|
return &index.TermFieldDoc{ID: nextTermKey, Freq: nextTerm.freq, Norm: nextTerm.norm, Vectors: nextTerm.vectors}, nil
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (reader *mockTermFieldReader) Advance(ID string) (*index.TermFieldDoc, error) {
|
||||||
|
if reader.curr >= len(reader.sortedDocIds) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
i := reader.curr
|
||||||
|
for currTermID := reader.sortedDocIds[i]; currTermID < ID && i < len(reader.sortedDocIds); i += 1 {
|
||||||
|
reader.curr = i
|
||||||
|
currTermID = reader.sortedDocIds[reader.curr]
|
||||||
|
}
|
||||||
|
|
||||||
|
if reader.curr < len(reader.sortedDocIds) {
|
||||||
|
nextTermKey := reader.sortedDocIds[reader.curr]
|
||||||
|
nextTerm := reader.index[nextTermKey]
|
||||||
|
return &index.TermFieldDoc{ID: nextTermKey, Freq: nextTerm.freq, Norm: nextTerm.norm, Vectors: nextTerm.vectors}, nil
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count returns the number of documents matching this term/field pair.
func (reader *mockTermFieldReader) Count() uint64 {
	return uint64(len(reader.sortedDocIds))
}

// Close is a no-op; the mock reader holds no external resources.
func (reader *mockTermFieldReader) Close() {}
|
||||||
|
|
||||||
|
func (mi *MockIndex) mockVectorsFromTokenFreq(field string, tf *analysis.TokenFreq) []*index.TermFieldVector {
|
||||||
|
rv := make([]*index.TermFieldVector, len(tf.Locations))
|
||||||
|
|
||||||
|
for i, l := range tf.Locations {
|
||||||
|
mv := index.TermFieldVector{
|
||||||
|
Field: field,
|
||||||
|
Pos: uint64(l.Position),
|
||||||
|
Start: uint64(l.Start),
|
||||||
|
End: uint64(l.End),
|
||||||
|
}
|
||||||
|
rv[i] = &mv
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dump is part of the index interface but is not implemented for the mock.
func (mi *MockIndex) Dump() {
	fmt.Println("dump not implemented")
}
|
|
@ -0,0 +1,124 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package mock
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
_ "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer"
|
||||||
|
"github.com/couchbaselabs/bleve/document"
|
||||||
|
"github.com/couchbaselabs/bleve/index"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestCRUD exercises the full MockIndex lifecycle: create, read
// (term field reader), update, and delete, checking doc counts and
// exact match results (freq, norm, term vectors) along the way.
func TestCRUD(t *testing.T) {
	i := NewMockIndex()

	// create doc, assert doc count goes up
	doc1 := document.NewDocument("1")
	doc1.AddField(document.NewTextField("name", []byte("marty")))
	i.Update(doc1)
	count := i.DocCount()
	if count != 1 {
		t.Errorf("expected document count to be 1, was: %d", count)
	}

	// add another doc, assert doc count goes up again
	doc2 := document.NewDocument("2")
	doc2.AddField(document.NewTextField("name", []byte("bob")))
	i.Update(doc2)
	count = i.DocCount()
	if count != 2 {
		t.Errorf("expected document count to be 2, was: %d", count)
	}

	// search for doc with term that should exist
	// norm is 1 because the field contains a single token
	expectedMatch := &index.TermFieldDoc{
		ID:   "1",
		Freq: 1,
		Norm: 1,
	}
	tfr, err := i.TermFieldReader([]byte("marty"), "name")
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	match, err := tfr.Next()
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	if !reflect.DeepEqual(expectedMatch, match) {
		t.Errorf("got %v, expected %v", match, expectedMatch)
	}
	nomatch, err := tfr.Next()
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	if nomatch != nil {
		t.Errorf("expected nil after last match")
	}

	// update doc, assert doc count doesn't go up
	doc1 = document.NewDocument("1")
	doc1.AddField(document.NewTextField("name", []byte("salad")))
	doc1.AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("eat more rice"), document.INDEX_FIELD|document.INCLUDE_TERM_VECTORS))
	i.Update(doc1)
	count = i.DocCount()
	if count != 2 {
		t.Errorf("expected document count to be 2, was: %d", count)
	}

	// perform the original search again, should NOT find anything this time
	tfr, err = i.TermFieldReader([]byte("marty"), "name")
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	nomatch, err = tfr.Next()
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	if nomatch != nil {
		t.Errorf("expected no matches, found one")
		t.Logf("%v", i)
	}

	// delete a doc, ensure the count is 1
	err = i.Delete("2")
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	count = i.DocCount()
	if count != 1 {
		t.Errorf("expected document count to be 1, was: %d", count)
	}

	// norm here is 1/sqrt(3): the "desc" field has three tokens
	expectedMatch = &index.TermFieldDoc{
		ID:   "1",
		Freq: 1,
		Norm: 0.5773502691896258,
		Vectors: []*index.TermFieldVector{
			&index.TermFieldVector{
				Field: "desc",
				Pos:   3,
				Start: 9,
				End:   13,
			},
		},
	}
	tfr, err = i.TermFieldReader([]byte("rice"), "desc")
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	match, err = tfr.Next()
	if err != nil {
		t.Errorf("unexpected error: %v", err)
	}
	if !reflect.DeepEqual(expectedMatch, match) {
		t.Errorf("got %#v, expected %#v", match, expectedMatch)
	}
}
|
|
@ -0,0 +1,101 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package upside_down
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
|
||||||
|
"github.com/jmhodges/levigo"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/index"
|
||||||
|
)
|
||||||
|
|
||||||
|
// UpsideDownCouchTermFieldReader iterates the term-frequency rows for a
// single term/field pair by walking a LevelDB iterator over the rows'
// common key prefix.
type UpsideDownCouchTermFieldReader struct {
	index    *UpsideDownCouch
	iterator *levigo.Iterator
	count    uint64 // total docs for this term/field, read from the prefix row
	term     []byte
	field    uint16
}
|
||||||
|
|
||||||
|
func newUpsideDownCouchTermFieldReader(index *UpsideDownCouch, term []byte, field uint16) (*UpsideDownCouchTermFieldReader, error) {
|
||||||
|
ro := defaultReadOptions()
|
||||||
|
it := index.db.NewIterator(ro)
|
||||||
|
|
||||||
|
tfr := NewTermFrequencyRow(term, field, "", 0, 0)
|
||||||
|
it.Seek(tfr.Key())
|
||||||
|
|
||||||
|
var count uint64 = 0
|
||||||
|
if it.Valid() {
|
||||||
|
if bytes.Equal(it.Key(), tfr.Key()) {
|
||||||
|
tfr = ParseFromKeyValue(it.Key(), it.Value()).(*TermFrequencyRow)
|
||||||
|
count = tfr.freq
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
return nil, it.GetError()
|
||||||
|
}
|
||||||
|
|
||||||
|
return &UpsideDownCouchTermFieldReader{
|
||||||
|
index: index,
|
||||||
|
iterator: it,
|
||||||
|
count: count,
|
||||||
|
term: term,
|
||||||
|
field: field,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count returns the number of documents containing this term in this
// field, as recorded by the term's summary row at construction time.
func (r *UpsideDownCouchTermFieldReader) Count() uint64 {
	return r.count
}
|
||||||
|
|
||||||
|
func (r *UpsideDownCouchTermFieldReader) Next() (*index.TermFieldDoc, error) {
|
||||||
|
r.iterator.Next()
|
||||||
|
if r.iterator.Valid() {
|
||||||
|
tfr := NewTermFrequencyRow(r.term, r.field, "", 0, 0)
|
||||||
|
if !bytes.HasPrefix(r.iterator.Key(), tfr.Key()) {
|
||||||
|
// end of the line
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
tfr = ParseFromKeyValue(r.iterator.Key(), r.iterator.Value()).(*TermFrequencyRow)
|
||||||
|
return &index.TermFieldDoc{
|
||||||
|
ID: string(tfr.doc),
|
||||||
|
Freq: tfr.freq,
|
||||||
|
Norm: float64(tfr.norm),
|
||||||
|
Vectors: r.index.termFieldVectorsFromTermVectors(tfr.vectors),
|
||||||
|
}, nil
|
||||||
|
} else {
|
||||||
|
return nil, r.iterator.GetError()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *UpsideDownCouchTermFieldReader) Advance(docId string) (*index.TermFieldDoc, error) {
|
||||||
|
tfr := NewTermFrequencyRow(r.term, r.field, docId, 0, 0)
|
||||||
|
r.iterator.Seek(tfr.Key())
|
||||||
|
if r.iterator.Valid() {
|
||||||
|
tfr := NewTermFrequencyRow(r.term, r.field, "", 0, 0)
|
||||||
|
if !bytes.HasPrefix(r.iterator.Key(), tfr.Key()) {
|
||||||
|
// end of the line
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
tfr = ParseFromKeyValue(r.iterator.Key(), r.iterator.Value()).(*TermFrequencyRow)
|
||||||
|
return &index.TermFieldDoc{
|
||||||
|
ID: string(tfr.doc),
|
||||||
|
Freq: tfr.freq,
|
||||||
|
Norm: float64(tfr.norm),
|
||||||
|
Vectors: r.index.termFieldVectorsFromTermVectors(tfr.vectors),
|
||||||
|
}, nil
|
||||||
|
} else {
|
||||||
|
return nil, r.iterator.GetError()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close releases the underlying LevelDB iterator. The reader must not
// be used after Close.
func (r *UpsideDownCouchTermFieldReader) Close() {
	r.iterator.Close()
}
|
|
@ -0,0 +1,111 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package upside_down
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
_ "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer"
|
||||||
|
"github.com/couchbaselabs/bleve/document"
|
||||||
|
"github.com/couchbaselabs/bleve/index"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestIndexReader(t *testing.T) {
|
||||||
|
defer os.RemoveAll("test")
|
||||||
|
|
||||||
|
idx := NewUpsideDownCouch("test")
|
||||||
|
|
||||||
|
err := idx.Open()
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("error opening index: %v", err)
|
||||||
|
}
|
||||||
|
defer idx.Close()
|
||||||
|
|
||||||
|
var expectedCount uint64 = 0
|
||||||
|
doc := document.NewDocument("1")
|
||||||
|
doc.AddField(document.NewTextField("name", []byte("test")))
|
||||||
|
err = idx.Update(doc)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Error updating index: %v", err)
|
||||||
|
}
|
||||||
|
expectedCount += 1
|
||||||
|
|
||||||
|
doc = document.NewDocument("2")
|
||||||
|
doc.AddField(document.NewTextField("name", []byte("test test test")))
|
||||||
|
doc.AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("eat more rice"), document.INDEX_FIELD|document.INCLUDE_TERM_VECTORS))
|
||||||
|
err = idx.Update(doc)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Error updating index: %v", err)
|
||||||
|
}
|
||||||
|
expectedCount += 1
|
||||||
|
|
||||||
|
// first look for a term that doesnt exist
|
||||||
|
reader, err := idx.TermFieldReader([]byte("nope"), "name")
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Error accessing term field reader: %v", err)
|
||||||
|
}
|
||||||
|
count := reader.Count()
|
||||||
|
if count != 0 {
|
||||||
|
t.Errorf("Expected doc count to be: %d got: %d", 0, count)
|
||||||
|
}
|
||||||
|
reader.Close()
|
||||||
|
|
||||||
|
reader, err = idx.TermFieldReader([]byte("test"), "name")
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Error accessing term field reader: %v", err)
|
||||||
|
}
|
||||||
|
defer reader.Close()
|
||||||
|
|
||||||
|
expectedCount = 2
|
||||||
|
count = reader.Count()
|
||||||
|
if count != expectedCount {
|
||||||
|
t.Errorf("Exptected doc count to be: %d got: %d", expectedCount, count)
|
||||||
|
}
|
||||||
|
|
||||||
|
var match *index.TermFieldDoc
|
||||||
|
var actualCount uint64
|
||||||
|
match, err = reader.Next()
|
||||||
|
for err == nil && match != nil {
|
||||||
|
match, err = reader.Next()
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("unexpected error reading next")
|
||||||
|
}
|
||||||
|
actualCount += 1
|
||||||
|
}
|
||||||
|
if actualCount != count {
|
||||||
|
t.Errorf("count was 2, but only saw %d", actualCount)
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedMatch := &index.TermFieldDoc{
|
||||||
|
ID: "2",
|
||||||
|
Freq: 1,
|
||||||
|
Norm: 0.5773502588272095,
|
||||||
|
Vectors: []*index.TermFieldVector{
|
||||||
|
&index.TermFieldVector{
|
||||||
|
Field: "desc",
|
||||||
|
Pos: 3,
|
||||||
|
Start: 9,
|
||||||
|
End: 13,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
tfr, err := idx.TermFieldReader([]byte("rice"), "desc")
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
match, err = tfr.Next()
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(expectedMatch, match) {
|
||||||
|
t.Errorf("got %#v, expected %#v", match, expectedMatch)
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,412 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package upside_down
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
)
|
||||||
|
|
||||||
|
// BYTE_SEPARATOR delimits variable-length byte fields within keys and
// values (0xff cannot appear in the UTF-8 text being stored).
const BYTE_SEPARATOR byte = 0xff

// UpsideDownCouchRowStream delivers rows over a channel, e.g. for dumps.
type UpsideDownCouchRowStream chan UpsideDownCouchRow

// UpsideDownCouchRow is any row that can serialize itself to a
// key/value pair for storage in the underlying k/v store.
type UpsideDownCouchRow interface {
	Key() []byte
	Value() []byte
}
|
||||||
|
|
||||||
|
// ParseFromKeyValue reconstructs a typed row from a stored k/v pair by
// dispatching on the single-byte type tag at the front of the key:
// 'v' version, 'f' field definition, 't' term frequency, 'b' back index.
// An unrecognized tag yields nil.
func ParseFromKeyValue(key, value []byte) UpsideDownCouchRow {
	switch key[0] {
	case 'v':
		return NewVersionRowKV(key, value)
	case 'f':
		return NewFieldRowKV(key, value)
	case 't':
		return NewTermFrequencyRowKV(key, value)
	case 'b':
		return NewBackIndexRowKV(key, value)
	}
	return nil
}
|
||||||
|
|
||||||
|
// VERSION
|
||||||
|
|
||||||
|
// VERSION

// VersionRow records the on-disk format version under the fixed key 'v'.
type VersionRow struct {
	version uint8
}

// Key returns the fixed single-byte key for the version row.
func (v *VersionRow) Key() []byte {
	return []byte{'v'}
}

// Value encodes the version as a single little-endian byte.
// Serialization errors are programmer bugs, hence the panic.
func (v *VersionRow) Value() []byte {
	buf := new(bytes.Buffer)
	err := binary.Write(buf, binary.LittleEndian, v.version)
	if err != nil {
		panic(fmt.Sprintf("binary.Write failed: %v", err))
	}
	return buf.Bytes()
}

// String renders the row for debugging/dump output.
func (v *VersionRow) String() string {
	return fmt.Sprintf("Version: %d", v.version)
}

// NewVersionRow creates a version row for the given format version.
func NewVersionRow(version uint8) *VersionRow {
	return &VersionRow{
		version: version,
	}
}

// NewVersionRowKV decodes a version row from its stored k/v pair.
// The key carries no payload; only the value is read.
func NewVersionRowKV(key, value []byte) *VersionRow {
	rv := VersionRow{}
	buf := bytes.NewBuffer(value)
	err := binary.Read(buf, binary.LittleEndian, &rv.version)
	if err != nil {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}
	return &rv
}
|
||||||
|
|
||||||
|
// FIELD definition
|
||||||
|
|
||||||
|
// FIELD definition

// FieldRow maps a compact numeric field index to its field name.
// Key layout:   'f' | uint16 index (LE)
// Value layout: name | BYTE_SEPARATOR
type FieldRow struct {
	index uint16
	name  string
}

// Key encodes the 'f' tag followed by the little-endian field index.
func (f *FieldRow) Key() []byte {
	buf := new(bytes.Buffer)
	err := buf.WriteByte('f')
	if err != nil {
		panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
	}
	err = binary.Write(buf, binary.LittleEndian, f.index)
	if err != nil {
		panic(fmt.Sprintf("binary.Write failed: %v", err))
	}
	return buf.Bytes()
}

// Value encodes the field name terminated by BYTE_SEPARATOR.
func (f *FieldRow) Value() []byte {
	buf := new(bytes.Buffer)
	_, err := buf.WriteString(f.name)
	if err != nil {
		panic(fmt.Sprintf("Buffer.WriteString failed: %v", err))
	}
	err = buf.WriteByte(BYTE_SEPARATOR)
	if err != nil {
		panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
	}
	return buf.Bytes()
}

// String renders the row for debugging/dump output.
func (f *FieldRow) String() string {
	return fmt.Sprintf("Field: %d Name: %s", f.index, f.name)
}

// NewFieldRow creates a field-definition row.
func NewFieldRow(index uint16, name string) *FieldRow {
	return &FieldRow{
		index: index,
		name:  name,
	}
}

// NewFieldRowKV decodes a field row: the index from the key (after the
// type byte) and the separator-terminated name from the value.
func NewFieldRowKV(key, value []byte) *FieldRow {
	rv := FieldRow{}

	buf := bytes.NewBuffer(key)
	buf.ReadByte() // type
	err := binary.Read(buf, binary.LittleEndian, &rv.index)
	if err != nil {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}

	buf = bytes.NewBuffer(value)
	rv.name, err = buf.ReadString(BYTE_SEPARATOR)
	if err != nil {
		panic(fmt.Sprintf("Buffer.ReadString failed: %v", err))
	}
	rv.name = rv.name[:len(rv.name)-1] // trim off separator byte

	return &rv
}
|
||||||
|
|
||||||
|
// TERM FIELD FREQUENCY
|
||||||
|
|
||||||
|
// TERM FIELD FREQUENCY

// TermVector records one occurrence of a term: the field it appeared
// in, its token position, and its byte offsets within the field value.
type TermVector struct {
	field uint16
	pos   uint64
	start uint64
	end   uint64
}

// String renders the vector for debugging/dump output.
func (tv *TermVector) String() string {
	return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d", tv.field, tv.pos, tv.start, tv.end)
}
|
||||||
|
|
||||||
|
// TermFrequencyRow is the core posting row: for a (term, field, doc)
// triple it stores the term frequency, the field-length norm, and
// (optionally) the term vectors for each occurrence.
// Key layout:   't' | term | BYTE_SEPARATOR | uint16 field (LE) | doc
// Value layout: uint64 freq | float32 norm | repeated vector records
type TermFrequencyRow struct {
	term    []byte
	field   uint16
	doc     []byte
	freq    uint64
	norm    float32
	vectors []*TermVector
}
|
||||||
|
|
||||||
|
// Key encodes 't' | term | BYTE_SEPARATOR | field (uint16 LE) | doc.
// With an empty doc this key is also the shared prefix of every row for
// the same term/field pair, which the readers rely on for range scans.
func (tfr *TermFrequencyRow) Key() []byte {
	buf := new(bytes.Buffer)
	err := buf.WriteByte('t')
	if err != nil {
		panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
	}
	_, err = buf.Write(tfr.term)
	if err != nil {
		panic(fmt.Sprintf("Buffer.Write failed: %v", err))
	}
	err = buf.WriteByte(BYTE_SEPARATOR)
	if err != nil {
		panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
	}
	err = binary.Write(buf, binary.LittleEndian, tfr.field)
	if err != nil {
		panic(fmt.Sprintf("binary.Write failed: %v", err))
	}
	_, err = buf.Write(tfr.doc)
	if err != nil {
		panic(fmt.Sprintf("Buffer.Write failed: %v", err))
	}
	return buf.Bytes()
}
|
||||||
|
|
||||||
|
// Value encodes freq (uint64 LE), norm (float32 LE), then one fixed-size
// record per term vector: field uint16, pos/start/end uint64, all LE.
// The vector list carries no length prefix; the decoder reads records
// until EOF.
func (tfr *TermFrequencyRow) Value() []byte {
	buf := new(bytes.Buffer)
	err := binary.Write(buf, binary.LittleEndian, tfr.freq)
	if err != nil {
		panic(fmt.Sprintf("binary.Write failed: %v", err))
	}
	err = binary.Write(buf, binary.LittleEndian, tfr.norm)
	if err != nil {
		panic(fmt.Sprintf("binary.Write failed: %v", err))
	}
	for _, vector := range tfr.vectors {
		err = binary.Write(buf, binary.LittleEndian, vector.field)
		if err != nil {
			panic(fmt.Sprintf("binary.Write failed: %v", err))
		}
		err = binary.Write(buf, binary.LittleEndian, vector.pos)
		if err != nil {
			panic(fmt.Sprintf("binary.Write failed: %v", err))
		}
		err = binary.Write(buf, binary.LittleEndian, vector.start)
		if err != nil {
			panic(fmt.Sprintf("binary.Write failed: %v", err))
		}
		err = binary.Write(buf, binary.LittleEndian, vector.end)
		if err != nil {
			panic(fmt.Sprintf("binary.Write failed: %v", err))
		}
	}
	return buf.Bytes()
}
|
||||||
|
|
||||||
|
// String renders the row for debugging/dump output.
func (tfr *TermFrequencyRow) String() string {
	return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
}

// NewTermFrequencyRow creates a posting row without term vectors.
// An empty doc produces the term/field summary row.
func NewTermFrequencyRow(term []byte, field uint16, doc string, freq uint64, norm float32) *TermFrequencyRow {
	return &TermFrequencyRow{
		term:  term,
		field: field,
		doc:   []byte(doc),
		freq:  freq,
		norm:  norm,
	}
}

// NewTermFrequencyRowWithTermVectors creates a posting row that also
// records the location of each term occurrence.
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, doc string, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
	return &TermFrequencyRow{
		term:    term,
		field:   field,
		doc:     []byte(doc),
		freq:    freq,
		norm:    norm,
		vectors: vectors,
	}
}
|
||||||
|
|
||||||
|
// NewTermFrequencyRowKV decodes a posting row from its stored k/v pair.
// The key yields term, field, and doc id; the value yields freq, norm,
// and the optional term-vector records, which are read in a loop that
// terminates on io.EOF (the value carries no vector count).
func NewTermFrequencyRowKV(key, value []byte) *TermFrequencyRow {
	rv := TermFrequencyRow{
		doc: []byte(""),
	}
	buf := bytes.NewBuffer(key)
	buf.ReadByte() // type

	var err error
	rv.term, err = buf.ReadBytes(BYTE_SEPARATOR)
	if err != nil {
		panic(fmt.Sprintf("Buffer.ReadString failed: %v", err))
	}
	rv.term = rv.term[:len(rv.term)-1] // trim off separator byte

	err = binary.Read(buf, binary.LittleEndian, &rv.field)
	if err != nil {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}

	// the doc id is the unterminated remainder of the key, so this read
	// is expected to hit EOF rather than find another separator
	doc, err := buf.ReadBytes(BYTE_SEPARATOR)
	if err != io.EOF {
		panic(fmt.Sprintf("expected binary.ReadString to end in EOF: %v", err))
	}
	if doc != nil {
		rv.doc = doc
	}

	buf = bytes.NewBuffer((value))
	err = binary.Read(buf, binary.LittleEndian, &rv.freq)
	if err != nil {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}
	err = binary.Read(buf, binary.LittleEndian, &rv.norm)
	if err != nil {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}

	// read vector records until the value is exhausted; each record
	// starts with its field id, so a clean EOF here means "no more"
	var field uint16
	err = binary.Read(buf, binary.LittleEndian, &field)
	if err != nil && err != io.EOF {
		panic(fmt.Sprintf("binary.Read failed: %v", err))
	}
	for err != io.EOF {
		tv := TermVector{}
		tv.field = field
		// at this point we expect at least one term vector
		if rv.vectors == nil {
			rv.vectors = make([]*TermVector, 0)
		}

		err = binary.Read(buf, binary.LittleEndian, &tv.pos)
		if err != nil {
			panic(fmt.Sprintf("binary.Read failed: %v", err))
		}
		err = binary.Read(buf, binary.LittleEndian, &tv.start)
		if err != nil {
			panic(fmt.Sprintf("binary.Read failed: %v", err))
		}
		err = binary.Read(buf, binary.LittleEndian, &tv.end)
		if err != nil {
			panic(fmt.Sprintf("binary.Read failed: %v", err))
		}
		rv.vectors = append(rv.vectors, &tv)
		// try to read next record (may not exist)
		err = binary.Read(buf, binary.LittleEndian, &field)
	}

	return &rv
}
|
||||||
|
|
||||||
|
// BackIndexEntry records one (term, field) pair a document contributed,
// so deletes and updates can find every posting the document touches.
type BackIndexEntry struct {
	term  []byte
	field uint16
}

// String renders the entry for debugging/dump output.
func (bie *BackIndexEntry) String() string {
	return fmt.Sprintf("Term: `%s` Field: %d", string(bie.term), bie.field)
}

// BackIndexRow maps a document id to all of its (term, field) entries.
// Key layout:   'b' | doc
// Value layout: repeated: term | BYTE_SEPARATOR | uint16 field (LE)
type BackIndexRow struct {
	doc     []byte
	entries []*BackIndexEntry
}
|
||||||
|
|
||||||
|
func (br *BackIndexRow) Key() []byte {
|
||||||
|
buf := new(bytes.Buffer)
|
||||||
|
err := buf.WriteByte('b')
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
|
||||||
|
}
|
||||||
|
err = binary.Write(buf, binary.LittleEndian, br.doc)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Sprintf("binary.Write failed: %v", err))
|
||||||
|
}
|
||||||
|
return buf.Bytes()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Value encodes each entry as term | BYTE_SEPARATOR | field (uint16 LE),
// concatenated with no count prefix; the decoder reads until EOF.
func (br *BackIndexRow) Value() []byte {
	buf := new(bytes.Buffer)
	for _, e := range br.entries {
		_, err := buf.Write(e.term)
		if err != nil {
			panic(fmt.Sprintf("Buffer.Write failed: %v", err))
		}
		err = buf.WriteByte(BYTE_SEPARATOR)
		if err != nil {
			panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err))
		}
		err = binary.Write(buf, binary.LittleEndian, e.field)
		if err != nil {
			panic(fmt.Sprintf("binary.Write failed: %v", err))
		}
	}
	return buf.Bytes()
}
|
||||||
|
|
||||||
|
// String renders the row for debugging/dump output.
func (br *BackIndexRow) String() string {
	return fmt.Sprintf("Backindex DocId: `%s` Entries: %v", string(br.doc), br.entries)
}

// NewBackIndexRow creates a back-index row for the given document id.
func NewBackIndexRow(doc string, entries []*BackIndexEntry) *BackIndexRow {
	return &BackIndexRow{
		doc:     []byte(doc),
		entries: entries,
	}
}
|
||||||
|
|
||||||
|
// NewBackIndexRowKV decodes a back-index row from its stored k/v pair.
// The doc id is the unterminated remainder of the key (the read is
// expected to end in io.EOF); entries are then read from the value in a
// loop that terminates when no further separator-terminated term is
// found.
func NewBackIndexRowKV(key, value []byte) *BackIndexRow {
	rv := BackIndexRow{}

	buf := bytes.NewBuffer(key)
	buf.ReadByte() // type

	var err error
	rv.doc, err = buf.ReadBytes(BYTE_SEPARATOR)
	if err != io.EOF {
		panic(fmt.Sprintf("expected binary.ReadString to end in EOF: %v", err))
	}

	buf = bytes.NewBuffer(value)
	rv.entries = make([]*BackIndexEntry, 0)

	var term []byte
	term, err = buf.ReadBytes(BYTE_SEPARATOR)
	if err != nil && err != io.EOF {
		panic(fmt.Sprintf("Buffer.ReadString failed: %v", err))
	}
	for err != io.EOF {
		ent := BackIndexEntry{}
		ent.term = term[:len(term)-1] // trim off separator byte

		err = binary.Read(buf, binary.LittleEndian, &ent.field)
		if err != nil {
			panic(fmt.Sprintf("binary.Read failed: %v", err))
		}
		rv.entries = append(rv.entries, &ent)

		// try to read the next entry's term (may not exist)
		term, err = buf.ReadBytes(BYTE_SEPARATOR)
		if err != nil && err != io.EOF {
			panic(fmt.Sprintf("Buffer.ReadString failed: %v", err))
		}
	}

	return &rv
}
|
|
@ -0,0 +1,89 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package upside_down
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestRows round-trips every row type: each struct is serialized to its
// expected key/value bytes, then those bytes are parsed back via
// ParseFromKeyValue and compared structurally against the original.
func TestRows(t *testing.T) {
	tests := []struct {
		input  UpsideDownCouchRow
		outKey []byte
		outVal []byte
	}{
		{
			NewVersionRow(1),
			[]byte{'v'},
			[]byte{0x1},
		},
		{
			NewFieldRow(0, "name"),
			[]byte{'f', 0, 0},
			[]byte{'n', 'a', 'm', 'e', BYTE_SEPARATOR},
		},
		{
			NewFieldRow(1, "desc"),
			[]byte{'f', 1, 0},
			[]byte{'d', 'e', 's', 'c', BYTE_SEPARATOR},
		},
		{
			// 513 little-endian uint16 is bytes {1, 2}
			NewFieldRow(513, "style"),
			[]byte{'f', 1, 2},
			[]byte{'s', 't', 'y', 'l', 'e', BYTE_SEPARATOR},
		},
		{
			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "", 3, 3.14),
			[]byte{'t', 'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0},
			[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64},
		},
		{
			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14),
			[]byte{'t', 'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
			[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64},
		},
		{
			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
			[]byte{'t', 'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
			[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 51, 0, 0, 0, 0, 0, 0, 0},
		},
		{
			NewBackIndexRow("budweiser", []*BackIndexEntry{&BackIndexEntry{[]byte{'b', 'e', 'e', 'r'}, 0}}),
			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
			[]byte{'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0},
		},
		{
			NewBackIndexRow("budweiser", []*BackIndexEntry{&BackIndexEntry{[]byte{'b', 'e', 'e', 'r'}, 0}, &BackIndexEntry{[]byte{'b', 'e', 'a', 't'}, 1}}),
			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
			[]byte{'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0, 'b', 'e', 'a', 't', BYTE_SEPARATOR, 1, 0},
		},
	}

	// test going from struct to k/v bytes
	for _, test := range tests {
		rk := test.input.Key()
		if !reflect.DeepEqual(rk, test.outKey) {
			t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
		}
		rv := test.input.Value()
		if !reflect.DeepEqual(rv, test.outVal) {
			t.Errorf("Expected value to be %v got: %v", test.outVal, rv)
		}
	}

	// now test going back from k/v bytes to struct
	for _, test := range tests {
		row := ParseFromKeyValue(test.outKey, test.outVal)
		if !reflect.DeepEqual(row, test.input) {
			t.Fatalf("Expected: %#v got: %#v", test.input, row)
		}
	}

}
|
|
@ -0,0 +1,466 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package upside_down
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
|
"github.com/jmhodges/levigo"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/document"
|
||||||
|
"github.com/couchbaselabs/bleve/index"
|
||||||
|
)
|
||||||
|
|
||||||
|
// VERSION_KEY is the database key under which the index format version is stored.
var VERSION_KEY []byte = []byte{'v'}

// VERSION is the current index format version written by this package.
const VERSION uint8 = 1
|
||||||
|
|
||||||
|
// UpsideDownCouch is an index implementation backed by a LevelDB
// key/value store (via levigo).
type UpsideDownCouch struct {
	version        uint8                         // index format version (VERSION)
	path           string                        // filesystem path of the LevelDB database
	opts           *levigo.Options               // options used when opening the database
	db             *levigo.DB                    // open database handle; nil until Open
	fieldIndexes   map[string]uint16             // field name -> numeric field index
	lastFieldIndex int                           // highest field index assigned so far
	analyzer       map[string]*analysis.Analyzer // per-field analyzers
	docCount       uint64                        // cached document count, set at Open
}
|
||||||
|
|
||||||
|
func NewUpsideDownCouch(path string) *UpsideDownCouch {
|
||||||
|
opts := levigo.NewOptions()
|
||||||
|
opts.SetCreateIfMissing(true)
|
||||||
|
|
||||||
|
return &UpsideDownCouch{
|
||||||
|
version: VERSION,
|
||||||
|
path: path,
|
||||||
|
opts: opts,
|
||||||
|
analyzer: make(map[string]*analysis.Analyzer),
|
||||||
|
fieldIndexes: make(map[string]uint16),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (udc *UpsideDownCouch) init() (err error) {
|
||||||
|
// prepare a list of rows
|
||||||
|
rows := make([]UpsideDownCouchRow, 0)
|
||||||
|
|
||||||
|
// version marker
|
||||||
|
rows = append(rows, NewVersionRow(udc.version))
|
||||||
|
|
||||||
|
return udc.batchRows(nil, rows, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (udc *UpsideDownCouch) loadSchema() (err error) {
|
||||||
|
// schema := make([]*index.Field, 0)
|
||||||
|
|
||||||
|
ro := defaultReadOptions()
|
||||||
|
it := udc.db.NewIterator(ro)
|
||||||
|
defer it.Close()
|
||||||
|
|
||||||
|
keyPrefix := []byte{'f'}
|
||||||
|
it.Seek(keyPrefix)
|
||||||
|
for it = it; it.Valid(); it.Next() {
|
||||||
|
// stop when
|
||||||
|
if !bytes.HasPrefix(it.Key(), keyPrefix) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
fieldRow := NewFieldRowKV(it.Key(), it.Value())
|
||||||
|
udc.fieldIndexes[fieldRow.name] = fieldRow.index
|
||||||
|
if int(fieldRow.index) > udc.lastFieldIndex {
|
||||||
|
udc.lastFieldIndex = int(fieldRow.index)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
err = it.GetError()
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func (udc *UpsideDownCouch) batchRows(addRows []UpsideDownCouchRow, updateRows []UpsideDownCouchRow, deleteRows []UpsideDownCouchRow) (err error) {
|
||||||
|
ro := defaultReadOptions()
|
||||||
|
|
||||||
|
// prepare batch
|
||||||
|
wb := levigo.NewWriteBatch()
|
||||||
|
|
||||||
|
// add
|
||||||
|
for _, row := range addRows {
|
||||||
|
tfr, ok := row.(*TermFrequencyRow)
|
||||||
|
if ok {
|
||||||
|
// need to increment counter
|
||||||
|
tr := NewTermFrequencyRow(tfr.term, tfr.field, "", 0, 0)
|
||||||
|
val, err := udc.db.Get(ro, tr.Key())
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if val != nil {
|
||||||
|
tr = ParseFromKeyValue(tr.Key(), val).(*TermFrequencyRow)
|
||||||
|
tr.freq += 1 // incr
|
||||||
|
} else {
|
||||||
|
tr = NewTermFrequencyRow(tfr.term, tfr.field, "", 1, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
// now add this to the batch
|
||||||
|
wb.Put(tr.Key(), tr.Value())
|
||||||
|
}
|
||||||
|
wb.Put(row.Key(), row.Value())
|
||||||
|
}
|
||||||
|
|
||||||
|
// update
|
||||||
|
for _, row := range updateRows {
|
||||||
|
wb.Put(row.Key(), row.Value())
|
||||||
|
}
|
||||||
|
|
||||||
|
// delete
|
||||||
|
for _, row := range deleteRows {
|
||||||
|
tfr, ok := row.(*TermFrequencyRow)
|
||||||
|
if ok {
|
||||||
|
// need to decrement counter
|
||||||
|
tr := NewTermFrequencyRow(tfr.term, tfr.field, "", 0, 0)
|
||||||
|
val, err := udc.db.Get(ro, tr.Key())
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if val != nil {
|
||||||
|
tr = ParseFromKeyValue(tr.Key(), val).(*TermFrequencyRow)
|
||||||
|
tr.freq -= 1 // incr
|
||||||
|
} else {
|
||||||
|
log.Panic(fmt.Sprintf("unexpected missing row, deleting term, expected count row to exit: %v", tr.Key()))
|
||||||
|
}
|
||||||
|
|
||||||
|
if tr.freq == 0 {
|
||||||
|
wb.Delete(tr.Key())
|
||||||
|
} else {
|
||||||
|
// now add this to the batch
|
||||||
|
wb.Put(tr.Key(), tr.Value())
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
wb.Delete(row.Key())
|
||||||
|
}
|
||||||
|
|
||||||
|
// write out the batch
|
||||||
|
wo := defaultWriteOptions()
|
||||||
|
err = udc.db.Write(wo, wb)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// DocCount returns the cached number of documents in the index.
// The count is computed at Open time and maintained by Update/Delete.
func (udc *UpsideDownCouch) DocCount() uint64 {
	return udc.docCount
}
|
||||||
|
|
||||||
|
// Open opens (creating if necessary) the underlying LevelDB database.
// A missing version row means a brand-new database, which is initialized;
// otherwise the existing field schema is loaded. Finally the document
// count is computed and cached.
func (udc *UpsideDownCouch) Open() (err error) {
	udc.db, err = levigo.Open(udc.path, udc.opts)
	if err != nil {
		return
	}

	ro := defaultReadOptions()
	var value []byte
	value, err = udc.db.Get(ro, VERSION_KEY)
	if err != nil {
		return
	}

	// init new index OR load schema
	if value == nil {
		err = udc.init()
		if err != nil {
			return
		}
	} else {
		err = udc.loadSchema()
		if err != nil {
			return
		}
	}
	// set doc count
	udc.docCount = udc.countDocs()
	return
}
|
||||||
|
|
||||||
|
func (udc *UpsideDownCouch) countDocs() uint64 {
|
||||||
|
ro := defaultReadOptions()
|
||||||
|
ro.SetFillCache(false) // dont fill the cache with this
|
||||||
|
it := udc.db.NewIterator(ro)
|
||||||
|
defer it.Close()
|
||||||
|
|
||||||
|
// begining of back index
|
||||||
|
it.Seek([]byte{'b'})
|
||||||
|
|
||||||
|
var rv uint64 = 0
|
||||||
|
for it = it; it.Valid(); it.Next() {
|
||||||
|
if !bytes.HasPrefix(it.Key(), []byte{'b'}) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
rv += 1
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (udc *UpsideDownCouch) rowCount() uint64 {
|
||||||
|
ro := defaultReadOptions()
|
||||||
|
ro.SetFillCache(false) // dont fill the cache with this
|
||||||
|
it := udc.db.NewIterator(ro)
|
||||||
|
defer it.Close()
|
||||||
|
|
||||||
|
it.Seek([]byte{0})
|
||||||
|
|
||||||
|
var rv uint64 = 0
|
||||||
|
for it = it; it.Valid(); it.Next() {
|
||||||
|
rv += 1
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close closes the underlying LevelDB database handle.
func (udc *UpsideDownCouch) Close() {
	udc.db.Close()
}
|
||||||
|
|
||||||
|
// Update indexes doc, replacing any previous version stored under the
// same doc ID. It diffs the new analysis results against the existing
// back index row so that unchanged terms become updates, new terms
// become adds, and terms no longer present become deletes, then applies
// everything in one batch. The cached doc count is incremented only
// when the doc ID was not previously indexed.
func (udc *UpsideDownCouch) Update(doc *document.Document) error {
	// first we lookup the backindex row for the doc id if it exists
	// lookup the back index row
	backIndexRow, err := udc.backIndexRowForDoc(doc.ID)
	if err != nil {
		return err
	}

	var isAdd = true
	// a map for each field, map key is term (string) bool true for existence
	// FIXME hard-coded to max of 256 fields
	existingTermFieldMaps := make([]map[string]bool, 256)
	if backIndexRow != nil {
		isAdd = false
		// populate the per-field maps with the terms indexed last time
		for _, entry := range backIndexRow.entries {
			existingTermFieldMap := existingTermFieldMaps[entry.field]
			if existingTermFieldMap == nil {
				existingTermFieldMap = make(map[string]bool, 0)
				existingTermFieldMaps[entry.field] = existingTermFieldMap
			}
			existingTermFieldMap[string(entry.term)] = true
		}
	}

	// prepare a list of rows
	updateRows := make([]UpsideDownCouchRow, 0)
	addRows := make([]UpsideDownCouchRow, 0)

	// track our back index entries
	backIndexEntries := make([]*BackIndexEntry, 0)

	for _, field := range doc.Fields {
		fieldIndex, fieldExists := udc.fieldIndexes[field.Name]
		if !fieldExists {
			// assign next field id
			fieldIndex = uint16(udc.lastFieldIndex + 1)
			udc.fieldIndexes[field.Name] = fieldIndex
			// ensure this batch adds a row for this field
			row := NewFieldRow(uint16(fieldIndex), field.Name)
			updateRows = append(updateRows, row)
			udc.lastFieldIndex = int(fieldIndex)
		}

		existingTermFieldMap := existingTermFieldMaps[fieldIndex]

		// analyze the field value and compute the length normalization
		// factor from the token count
		analyzer := field.Analyzer
		tokens := analyzer.Analyze(field.Value)
		fieldLength := len(tokens) // number of tokens in this doc field
		fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
		tokenFreqs := analysis.TokenFrequency(tokens)
		for _, tf := range tokenFreqs {
			var termFreqRow *TermFrequencyRow
			if document.IncludeTermVectors(field.IndexingOptions) {
				tv := termVectorsFromTokenFreq(uint16(fieldIndex), tf)
				termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, uint16(fieldIndex), doc.ID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
			} else {
				termFreqRow = NewTermFrequencyRow(tf.Term, uint16(fieldIndex), doc.ID, uint64(frequencyFromTokenFreq(tf)), fieldNorm)
			}

			// record the back index entry
			backIndexEntry := BackIndexEntry{tf.Term, uint16(fieldIndex)}
			backIndexEntries = append(backIndexEntries, &backIndexEntry)

			// remove the entry from the map of existing term fields if it exists
			if existingTermFieldMap != nil {
				termString := string(tf.Term)
				_, ok := existingTermFieldMap[termString]
				if ok {
					// this is an update
					updateRows = append(updateRows, termFreqRow)
					// this term existed last time, delete it from that map
					delete(existingTermFieldMap, termString)
				} else {
					// this is an add
					addRows = append(addRows, termFreqRow)
				}
			} else {
				// this is an add
				addRows = append(addRows, termFreqRow)
			}
		}

	}

	// build the back index row
	backIndexRow = NewBackIndexRow(doc.ID, backIndexEntries)
	updateRows = append(updateRows, backIndexRow)

	// any of the existing rows that weren't updated need to be deleted
	deleteRows := make([]UpsideDownCouchRow, 0)
	for fieldIndex, existingTermFieldMap := range existingTermFieldMaps {
		if existingTermFieldMap != nil {
			for termString, _ := range existingTermFieldMap {
				termFreqRow := NewTermFrequencyRow([]byte(termString), uint16(fieldIndex), doc.ID, 0, 0)
				deleteRows = append(deleteRows, termFreqRow)
			}
		}
	}

	err = udc.batchRows(addRows, updateRows, deleteRows)
	if err == nil && isAdd {
		udc.docCount += 1
	}
	return err
}
|
||||||
|
|
||||||
|
func (udc *UpsideDownCouch) Delete(id string) error {
|
||||||
|
// lookup the back index row
|
||||||
|
backIndexRow, err := udc.backIndexRowForDoc(id)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if backIndexRow == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare a list of rows to delete
|
||||||
|
rows := make([]UpsideDownCouchRow, 0)
|
||||||
|
for _, backIndexEntry := range backIndexRow.entries {
|
||||||
|
tfr := NewTermFrequencyRow(backIndexEntry.term, backIndexEntry.field, id, 0, 0)
|
||||||
|
rows = append(rows, tfr)
|
||||||
|
}
|
||||||
|
|
||||||
|
// also delete the back entry itself
|
||||||
|
rows = append(rows, backIndexRow)
|
||||||
|
|
||||||
|
err = udc.batchRows(nil, nil, rows)
|
||||||
|
if err == nil {
|
||||||
|
udc.docCount -= 1
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (udc *UpsideDownCouch) backIndexRowForDoc(docId string) (*BackIndexRow, error) {
|
||||||
|
ro := defaultReadOptions()
|
||||||
|
|
||||||
|
// use a temporary row structure to build key
|
||||||
|
tempRow := &BackIndexRow{
|
||||||
|
doc: []byte(docId),
|
||||||
|
}
|
||||||
|
key := tempRow.Key()
|
||||||
|
value, err := udc.db.Get(ro, key)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if value == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
backIndexRow := ParseFromKeyValue(key, value).(*BackIndexRow)
|
||||||
|
return backIndexRow, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dump prints every row in the database to stdout, both parsed and as
// raw hex. Debugging aid only.
func (udc *UpsideDownCouch) Dump() {
	ro := defaultReadOptions()
	ro.SetFillCache(false)
	it := udc.db.NewIterator(ro)
	defer it.Close()
	it.SeekToFirst()
	for it = it; it.Valid(); it.Next() {
		//fmt.Printf("Key: `%v` Value: `%v`\n", string(it.Key()), string(it.Value()))
		row := ParseFromKeyValue(it.Key(), it.Value())
		if row != nil {
			fmt.Printf("%v\n", row)
			fmt.Printf("Key: % -100x\nValue: % -100x\n\n", it.Key(), it.Value())
		}
	}
	err := it.GetError()
	if err != nil {
		fmt.Printf("Error reading iterator: %v", err)
	}
}
|
||||||
|
|
||||||
|
func (udc *UpsideDownCouch) TermFieldReader(term []byte, fieldName string) (index.TermFieldReader, error) {
|
||||||
|
fieldIndex, fieldExists := udc.fieldIndexes[fieldName]
|
||||||
|
if fieldExists {
|
||||||
|
return newUpsideDownCouchTermFieldReader(udc, term, uint16(fieldIndex))
|
||||||
|
}
|
||||||
|
log.Printf("fields: %v", udc.fieldIndexes)
|
||||||
|
return nil, fmt.Errorf("No field named `%s` in the schema", fieldName)
|
||||||
|
}
|
||||||
|
|
||||||
|
// defaultWriteOptions returns the write options used for all writes.
func defaultWriteOptions() *levigo.WriteOptions {
	wo := levigo.NewWriteOptions()
	// request fsync on write for safety
	wo.SetSync(true)
	return wo
}
|
||||||
|
|
||||||
|
// defaultReadOptions returns the read options used for all reads.
func defaultReadOptions() *levigo.ReadOptions {
	ro := levigo.NewReadOptions()
	return ro
}
|
||||||
|
|
||||||
|
// frequencyFromTokenFreq returns how many times the token occurred,
// i.e. the number of recorded locations.
func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
	return len(tf.Locations)
}
|
||||||
|
|
||||||
|
func termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) []*TermVector {
|
||||||
|
rv := make([]*TermVector, len(tf.Locations))
|
||||||
|
|
||||||
|
for i, l := range tf.Locations {
|
||||||
|
tv := TermVector{
|
||||||
|
field: field,
|
||||||
|
pos: uint64(l.Position),
|
||||||
|
start: uint64(l.Start),
|
||||||
|
end: uint64(l.End),
|
||||||
|
}
|
||||||
|
rv[i] = &tv
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
|
||||||
|
rv := make([]*index.TermFieldVector, len(in))
|
||||||
|
|
||||||
|
for i, tv := range in {
|
||||||
|
fieldName := udc.fieldIndexToName(tv.field)
|
||||||
|
tfv := index.TermFieldVector{
|
||||||
|
Field: fieldName,
|
||||||
|
Pos: tv.pos,
|
||||||
|
Start: tv.start,
|
||||||
|
End: tv.end,
|
||||||
|
}
|
||||||
|
rv[i] = &tfv
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (udc *UpsideDownCouch) fieldIndexToName(i uint16) string {
|
||||||
|
for fieldName, fieldIndex := range udc.fieldIndexes {
|
||||||
|
if i == fieldIndex {
|
||||||
|
return fieldName
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
|
@ -0,0 +1,221 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package upside_down
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
_ "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer"
|
||||||
|
"github.com/couchbaselabs/bleve/document"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestIndexOpenReopen verifies a fresh index starts empty, that opening
// writes exactly the version row, and that the index can be closed and
// reopened cleanly.
func TestIndexOpenReopen(t *testing.T) {
	defer os.RemoveAll("test")

	idx := NewUpsideDownCouch("test")
	err := idx.Open()
	if err != nil {
		t.Errorf("error opening index: %v", err)
	}

	var expectedCount uint64 = 0
	docCount := idx.DocCount()
	if docCount != expectedCount {
		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
	}

	// opening database should have inserted version
	expectedLength := uint64(1)
	rowCount := idx.rowCount()
	if rowCount != expectedLength {
		t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
	}

	// now close it
	idx.Close()

	idx = NewUpsideDownCouch("test")
	err = idx.Open()
	if err != nil {
		t.Errorf("error opening index: %v", err)
	}

	// now close it
	idx.Close()
}
|
||||||
|
|
||||||
|
// TestIndexInsert verifies that inserting a single one-term document
// bumps the doc count and produces the expected number of rows.
func TestIndexInsert(t *testing.T) {
	defer os.RemoveAll("test")

	idx := NewUpsideDownCouch("test")

	err := idx.Open()
	if err != nil {
		t.Errorf("error opening index: %v", err)
	}
	defer idx.Close()

	var expectedCount uint64 = 0
	docCount := idx.DocCount()
	if docCount != expectedCount {
		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
	}

	doc := document.NewDocument("1")
	doc.AddField(document.NewTextField("name", []byte("test")))
	err = idx.Update(doc)
	if err != nil {
		t.Errorf("Error updating index: %v", err)
	}
	expectedCount += 1

	docCount = idx.DocCount()
	if docCount != expectedCount {
		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
	}

	// should have 5 rows (1 for version, 1 for schema field, and 1 for single term, and 1 for the term count, and 1 for the back index entry)
	expectedLength := uint64(1 + 1 + 1 + 1 + 1)
	rowCount := idx.rowCount()
	if rowCount != expectedLength {
		t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
	}
}
|
||||||
|
|
||||||
|
// TestIndexInsertThenDelete verifies that deleting an inserted document
// restores the doc count and leaves only the version and schema rows.
func TestIndexInsertThenDelete(t *testing.T) {
	defer os.RemoveAll("test")

	idx := NewUpsideDownCouch("test")

	err := idx.Open()
	if err != nil {
		t.Errorf("error opening index: %v", err)
	}
	defer idx.Close()

	var expectedCount uint64 = 0
	docCount := idx.DocCount()
	if docCount != expectedCount {
		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
	}

	doc := document.NewDocument("1")
	doc.AddField(document.NewTextField("name", []byte("test")))
	err = idx.Update(doc)
	if err != nil {
		t.Errorf("Error updating index: %v", err)
	}
	expectedCount += 1

	docCount = idx.DocCount()
	if docCount != expectedCount {
		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
	}

	err = idx.Delete("1")
	if err != nil {
		t.Errorf("Error deleting entry from index: %v", err)
	}
	expectedCount -= 1

	docCount = idx.DocCount()
	if docCount != expectedCount {
		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
	}

	// should have 2 rows (1 for version, 1 for schema field)
	expectedLength := uint64(1 + 1)
	rowCount := idx.rowCount()
	if rowCount != expectedLength {
		t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
	}
}
|
||||||
|
|
||||||
|
// TestIndexInsertThenUpdate verifies row bookkeeping across updates:
// an update adding a term grows the row count, and an update dropping
// a term shrinks it back.
func TestIndexInsertThenUpdate(t *testing.T) {
	defer os.RemoveAll("test")

	idx := NewUpsideDownCouch("test")

	err := idx.Open()
	if err != nil {
		t.Errorf("error opening index: %v", err)
	}
	defer idx.Close()

	doc := document.NewDocument("1")
	doc.AddField(document.NewTextField("name", []byte("test")))
	err = idx.Update(doc)
	if err != nil {
		t.Errorf("Error updating index: %v", err)
	}

	// this update should overwrite one term, and introduce one new one
	doc = document.NewDocument("1")
	doc.AddField(document.NewTextField("name", []byte("test fail")))
	err = idx.Update(doc)
	if err != nil {
		t.Errorf("Error deleting entry from index: %v", err)
	}

	// should have 7 rows (1 for version, 1 for schema field, and 2 for the two terms, and 2 for the term counts, and 1 for the back index entry)
	expectedLength := uint64(1 + 1 + 2 + 2 + 1)
	rowCount := idx.rowCount()
	if rowCount != expectedLength {
		t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
	}

	// now do another update that should remove one of the terms
	doc = document.NewDocument("1")
	doc.AddField(document.NewTextField("name", []byte("fail")))
	err = idx.Update(doc)
	if err != nil {
		t.Errorf("Error deleting entry from index: %v", err)
	}

	// should have 5 rows (1 for version, 1 for schema field, and 1 for the remaining term, and 1 for the term count, and 1 for the back index entry)
	expectedLength = uint64(1 + 1 + 1 + 1 + 1)
	rowCount = idx.rowCount()
	if rowCount != expectedLength {
		t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
	}
}
|
||||||
|
|
||||||
|
// TestIndexInsertMultiple verifies that two documents sharing one term
// produce two term rows but a single shared term-count row.
func TestIndexInsertMultiple(t *testing.T) {
	defer os.RemoveAll("test")

	idx := NewUpsideDownCouch("test")

	err := idx.Open()
	if err != nil {
		t.Errorf("error opening index: %v", err)
	}
	defer idx.Close()

	doc := document.NewDocument("1")
	doc.AddField(document.NewTextField("name", []byte("test")))
	err = idx.Update(doc)
	if err != nil {
		t.Errorf("Error updating index: %v", err)
	}

	doc = document.NewDocument("2")
	doc.AddField(document.NewTextField("name", []byte("test")))
	err = idx.Update(doc)
	if err != nil {
		t.Errorf("Error updating index: %v", err)
	}

	// should have 7 rows (1 for version, 1 for schema field, and 2 for single term, and 1 for the term count, and 2 for the back index entries)
	expectedLength := uint64(1 + 1 + 2 + 1 + 2)
	rowCount := idx.rowCount()
	if rowCount != expectedLength {
		t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
	}
}
|
|
@ -0,0 +1,21 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Collector gathers the matches produced by a Searcher and exposes the
// collected hits along with aggregate statistics about the search.
type Collector interface {
	// Collect drains the searcher, accumulating its matches.
	Collect(searcher Searcher) error
	// Results returns the collected matches.
	Results() DocumentMatchCollection
	// Total returns the total number of matches seen.
	Total() uint64
	// MaxScore returns the highest score seen across all matches.
	MaxScore() float64
	// Took returns how long the collection phase took.
	Took() time.Duration
}
|
|
@ -0,0 +1,96 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"container/list"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TopScoreCollector keeps the k highest-scoring matches seen so far,
// stored in a linked list ordered by ascending score.
type TopScoreCollector struct {
	k        int           // maximum number of results to retain
	results  *list.List    // retained matches, ascending by score
	took     time.Duration // duration of the last Collect call
	maxScore float64       // highest score seen across all matches
	total    uint64        // total number of matches seen
}
|
||||||
|
|
||||||
|
// NewTopScorerCollector returns a collector that retains the top k
// matches by score.
func NewTopScorerCollector(k int) *TopScoreCollector {
	return &TopScoreCollector{
		k:       k,
		results: list.New(),
	}
}
|
||||||
|
|
||||||
|
// Total returns the total number of matches seen, including those not
// retained in the top k.
func (tksc *TopScoreCollector) Total() uint64 {
	return tksc.total
}
|
||||||
|
|
||||||
|
// MaxScore returns the highest score seen across all matches.
func (tksc *TopScoreCollector) MaxScore() float64 {
	return tksc.maxScore
}
|
||||||
|
|
||||||
|
// Took returns how long the last Collect call took.
func (tksc *TopScoreCollector) Took() time.Duration {
	return tksc.took
}
|
||||||
|
|
||||||
|
func (tksc *TopScoreCollector) Collect(searcher Searcher) error {
|
||||||
|
startTime := time.Now()
|
||||||
|
next, err := searcher.Next()
|
||||||
|
for err == nil && next != nil {
|
||||||
|
tksc.collectSingle(next)
|
||||||
|
next, err = searcher.Next()
|
||||||
|
}
|
||||||
|
// compute search duration
|
||||||
|
tksc.took = time.Since(startTime)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (tksc *TopScoreCollector) collectSingle(dm *DocumentMatch) {
|
||||||
|
// increment total hits
|
||||||
|
tksc.total += 1
|
||||||
|
|
||||||
|
// update max score
|
||||||
|
if dm.Score > tksc.maxScore {
|
||||||
|
tksc.maxScore = dm.Score
|
||||||
|
}
|
||||||
|
|
||||||
|
for e := tksc.results.Front(); e != nil; e = e.Next() {
|
||||||
|
curr := e.Value.(*DocumentMatch)
|
||||||
|
if dm.Score < curr.Score {
|
||||||
|
|
||||||
|
tksc.results.InsertBefore(dm, e)
|
||||||
|
// if we just made the list too long
|
||||||
|
if tksc.results.Len() > tksc.k {
|
||||||
|
// remove the head
|
||||||
|
tksc.results.Remove(tksc.results.Front())
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// if we got to the end, we still have to add it
|
||||||
|
tksc.results.PushBack(dm)
|
||||||
|
if tksc.results.Len() > tksc.k {
|
||||||
|
// remove the head
|
||||||
|
tksc.results.Remove(tksc.results.Front())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (tksc *TopScoreCollector) Results() DocumentMatchCollection {
|
||||||
|
rv := make(DocumentMatchCollection, tksc.results.Len())
|
||||||
|
i := 0
|
||||||
|
for e := tksc.results.Back(); e != nil; e = e.Prev() {
|
||||||
|
rv[i] = e.Value.(*DocumentMatch)
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
|
@ -0,0 +1,107 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestTop10Scores feeds 14 stub matches (ten scoring above 10, four at 9)
// through a top-10 collector and verifies exactly the ten high scorers
// are kept, with the single 99-score match first.
func TestTop10Scores(t *testing.T) {

	// a stub search with more than 10 matches
	// the top-10 scores are > 10
	// everything else is less than 10
	searcher := &stubSearcher{
		matches: DocumentMatchCollection{
			&DocumentMatch{
				ID:    "a",
				Score: 11,
			},
			&DocumentMatch{
				ID:    "b",
				Score: 9,
			},
			&DocumentMatch{
				ID:    "c",
				Score: 11,
			},
			&DocumentMatch{
				ID:    "d",
				Score: 9,
			},
			&DocumentMatch{
				ID:    "e",
				Score: 11,
			},
			&DocumentMatch{
				ID:    "f",
				Score: 9,
			},
			&DocumentMatch{
				ID:    "g",
				Score: 11,
			},
			&DocumentMatch{
				ID:    "h",
				Score: 9,
			},
			&DocumentMatch{
				ID:    "i",
				Score: 11,
			},
			&DocumentMatch{
				ID:    "j",
				Score: 11,
			},
			&DocumentMatch{
				ID:    "k",
				Score: 11,
			},
			&DocumentMatch{
				ID:    "l",
				Score: 99,
			},
			&DocumentMatch{
				ID:    "m",
				Score: 11,
			},
			&DocumentMatch{
				ID:    "n",
				Score: 11,
			},
		},
	}

	collector := NewTopScorerCollector(10)
	collector.Collect(searcher)
	results := collector.Results()

	if len(results) != 10 {
		t.Fatalf("expected 10 results, got %d", len(results))
	}

	if results[0].ID != "l" {
		t.Errorf("expected first result to have ID 'l', got %s", results[0].ID)
	}

	if results[0].Score != 99.0 {
		t.Errorf("expected highest score to be 99.0, got %f", results[0].Score)
	}

	minScore := 1000.0
	for _, result := range results {
		if result.Score < minScore {
			minScore = result.Score
		}
	}

	if minScore < 10 {
		t.Errorf("expected minimum score to be higher than 10, got %f", minScore)
	}
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Explanation describes how a score was computed. Value is the numeric
// contribution, Message describes the computation in words, and Children
// are the sub-computations that combine to produce Value.
type Explanation struct {
	Value    float64        `json:"value"`
	Message  string         `json:"message"`
	Children []*Explanation `json:"children,omitempty"`
}
|
||||||
|
|
||||||
|
func (expl *Explanation) String() string {
|
||||||
|
js, err := json.MarshalIndent(expl, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Sprintf("error serializing explation to json: %v", err)
|
||||||
|
}
|
||||||
|
return string(js)
|
||||||
|
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/index"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Query is a search request that can be validated and compiled into a
// Searcher over a particular index.
type Query interface {
	// Boost returns the score multiplier configured for this query.
	Boost() float64
	// Searcher builds an executable searcher for this query over index.
	Searcher(index index.Index) (Searcher, error)
	// Validate reports whether the query is well-formed.
	Validate() error
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/index"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TermQuery matches documents containing an exact term in a field.
type TermQuery struct {
	Term     string  `json:"term"`              // exact term to match
	Field    string  `json:"field,omitempty"`   // field in which to search
	BoostVal float64 `json:"boost,omitempty"`   // score multiplier
	Explain  bool    `json:"explain,omitempty"` // generate score explanations
}
|
||||||
|
|
||||||
|
// Boost returns the score multiplier configured for this query.
func (q *TermQuery) Boost() float64 {
	return q.BoostVal
}
|
||||||
|
|
||||||
|
// Searcher constructs a term searcher over index for this query.
func (q *TermQuery) Searcher(index index.Index) (Searcher, error) {
	return NewTermSearcher(index, q)
}
|
||||||
|
|
||||||
|
// Validate always succeeds; any term/field combination is accepted.
func (q *TermQuery) Validate() error {
	return nil
}
|
|
@ -0,0 +1,172 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/index"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MAX_SCORE_CACHE bounds the term frequencies whose computed scores
// (and explanations) are memoized per scorer.
const MAX_SCORE_CACHE = 64
|
||||||
|
|
||||||
|
// TermQueryScorer computes tf-idf style scores for documents matching a
// single term query.
type TermQueryScorer struct {
	query                  *TermQuery
	docTerm                uint64  // number of documents containing the term
	docTotal               uint64  // total number of documents in the index
	idf                    float64 // inverse document frequency factor
	explain                bool    // whether to build explanation trees
	idfExplanation         *Explanation
	scoreCache             map[int]float64      // score memoized by term frequency
	scoreExplanationCache  map[int]*Explanation // explanation memoized by term frequency
	queryNorm              float64
	queryWeight            float64 // boost * idf * queryNorm (1.0 until SetQueryNorm is called)
	queryWeightExplanation *Explanation
}
|
||||||
|
|
||||||
|
func NewTermQueryScorer(query *TermQuery, docTotal, docTerm uint64, explain bool) *TermQueryScorer {
|
||||||
|
rv := TermQueryScorer{
|
||||||
|
query: query,
|
||||||
|
docTerm: docTerm,
|
||||||
|
docTotal: docTotal,
|
||||||
|
idf: 1.0 + math.Log(float64(docTotal)/float64(docTerm+1.0)),
|
||||||
|
explain: explain,
|
||||||
|
scoreCache: make(map[int]float64, MAX_SCORE_CACHE),
|
||||||
|
scoreExplanationCache: make(map[int]*Explanation, MAX_SCORE_CACHE),
|
||||||
|
queryWeight: 1.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
if explain {
|
||||||
|
rv.idfExplanation = &Explanation{
|
||||||
|
Value: rv.idf,
|
||||||
|
Message: fmt.Sprintf("idf(docFreq=%d, maxDocs=%d)", docTerm, docTotal),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *TermQueryScorer) Weight() float64 {
|
||||||
|
sum := s.query.Boost() * s.idf
|
||||||
|
return sum * sum
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
|
||||||
|
s.queryNorm = qnorm
|
||||||
|
|
||||||
|
// update the query weight
|
||||||
|
s.queryWeight = s.query.Boost() * s.idf * s.queryNorm
|
||||||
|
|
||||||
|
if s.explain {
|
||||||
|
childrenExplanations := make([]*Explanation, 3)
|
||||||
|
childrenExplanations[0] = &Explanation{
|
||||||
|
Value: s.query.Boost(),
|
||||||
|
Message: "boost",
|
||||||
|
}
|
||||||
|
childrenExplanations[1] = s.idfExplanation
|
||||||
|
childrenExplanations[2] = &Explanation{
|
||||||
|
Value: s.queryNorm,
|
||||||
|
Message: "queryNorm",
|
||||||
|
}
|
||||||
|
s.queryWeightExplanation = &Explanation{
|
||||||
|
Value: s.queryWeight,
|
||||||
|
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.query.Field, string(s.query.Term), s.query.Boost()),
|
||||||
|
Children: childrenExplanations,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score converts one term/document posting into a scored DocumentMatch.
// Scores are memoized by term frequency (bounded by MAX_SCORE_CACHE)
// because the tf/norm/idf product is identical for equal frequencies
// within a scorer. When s.explain is set, an explanation tree mirroring
// the computation is attached to the match.
//
// NOTE(review): the memoized score includes termMatch.Norm from whichever
// document populated the cache entry — presumably norms are expected to be
// equal across documents here; confirm against the index implementation.
func (s *TermQueryScorer) Score(termMatch *index.TermFieldDoc) *DocumentMatch {

	var scoreExplanation *Explanation
	// see if the score was cached
	score, ok := s.scoreCache[int(termMatch.Freq)]
	if !ok {
		// need to compute score: tf is sqrt of the term frequency,
		// served from SQRT_CACHE for small frequencies
		var tf float64
		if termMatch.Freq < MAX_SQRT_CACHE {
			tf = SQRT_CACHE[int(termMatch.Freq)]
		} else {
			tf = math.Sqrt(float64(termMatch.Freq))
		}

		score = tf * termMatch.Norm * s.idf

		if s.explain {
			// explain the base field weight as tf * fieldNorm * idf
			childrenExplanations := make([]*Explanation, 3)
			childrenExplanations[0] = &Explanation{
				Value:   tf,
				Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.query.Field, string(s.query.Term), termMatch.Freq),
			}
			childrenExplanations[1] = &Explanation{
				Value:   termMatch.Norm,
				Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.query.Field, termMatch.ID),
			}
			childrenExplanations[2] = s.idfExplanation
			scoreExplanation = &Explanation{
				Value:    score,
				Message:  fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.query.Field, string(s.query.Term), termMatch.ID),
				Children: childrenExplanations,
			}
		}

		// if the query weight isn't 1, multiply it in and wrap the
		// explanation in a node showing the product
		if s.queryWeight != 1.0 {
			score = score * s.queryWeight
			if s.explain {
				childExplanations := make([]*Explanation, 2)
				childExplanations[0] = s.queryWeightExplanation
				childExplanations[1] = scoreExplanation
				scoreExplanation = &Explanation{
					Value:    score,
					Message:  fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.query.Field, string(s.query.Term), s.query.Boost(), termMatch.ID),
					Children: childExplanations,
				}
			}
		}

		// memoize small frequencies for subsequent documents
		if termMatch.Freq < MAX_SCORE_CACHE {
			s.scoreCache[int(termMatch.Freq)] = score
			if s.explain {
				s.scoreExplanationCache[int(termMatch.Freq)] = scoreExplanation
			}
		}
	}

	// cache hit: reuse the memoized explanation as well
	if ok && s.explain {
		scoreExplanation = s.scoreExplanationCache[int(termMatch.Freq)]
	}

	rv := DocumentMatch{
		ID:    termMatch.ID,
		Score: score,
	}
	if s.explain {
		rv.Expl = scoreExplanation
	}

	// copy any term vectors into per-field term locations on the match
	if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 {
		locations := make(Locations, len(termMatch.Vectors))
		for i, v := range termMatch.Vectors {
			loc := Location{
				Pos:   float64(v.Pos),
				Start: float64(v.Start),
				End:   float64(v.End),
			}
			locations[i] = &loc
		}
		tlm := make(TermLocationMap)
		tlm[s.query.Term] = locations
		rv.Locations = make(FieldTermLocationMap)
		rv.Locations[s.query.Field] = tlm
	}

	return &rv
}
|
|
@ -0,0 +1,39 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
// Location identifies a single occurrence of a term within a field.
type Location struct {
	Pos   float64 `json:"pos"`   // term position (from the index term vector)
	Start float64 `json:"start"` // start offset of the occurrence
	End   float64 `json:"end"`   // end offset of the occurrence
}

// Locations is a list of term occurrences.
type Locations []*Location

// TermLocationMap maps a term to the locations where it occurs.
type TermLocationMap map[string]Locations

// FieldTermLocationMap maps a field name to its term locations.
type FieldTermLocationMap map[string]TermLocationMap

// DocumentMatch couples a matching document ID with its score and,
// optionally, a score explanation and term locations.
type DocumentMatch struct {
	ID        string               `json:"id"`
	Score     float64              `json:"score"`
	Expl      *Explanation         `json:"explanation,omitempty"`
	Locations FieldTermLocationMap `json:"locations,omitempty"`
}

// DocumentMatchCollection is an ordered list of matches.
type DocumentMatchCollection []*DocumentMatch

// Searcher iterates over scored document matches.
type Searcher interface {
	// Next returns the next match, or nil when exhausted.
	Next() (*DocumentMatch, error)
	// Advance returns the first remaining match whose ID is not less
	// than ID (presumably matches are produced in ID order), or nil.
	Advance(ID string) (*DocumentMatch, error)
	Close()
	Weight() float64
	SetQueryNorm(float64)
	Count() uint64
}
|
|
@ -0,0 +1,84 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/index"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TermSearcher iterates the documents containing a single term in a
// field, scoring each match.
type TermSearcher struct {
	index  index.Index
	query  *TermQuery
	reader index.TermFieldReader // posting iterator for the term/field
	scorer *TermQueryScorer
}
|
||||||
|
|
||||||
|
func NewTermSearcher(index index.Index, query *TermQuery) (*TermSearcher, error) {
|
||||||
|
reader, err := index.TermFieldReader([]byte(query.Term), query.Field)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
scorer := NewTermQueryScorer(query, index.DocCount(), reader.Count(), query.Explain)
|
||||||
|
return &TermSearcher{
|
||||||
|
index: index,
|
||||||
|
query: query,
|
||||||
|
reader: reader,
|
||||||
|
scorer: scorer,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count returns the number of documents containing the term.
func (s *TermSearcher) Count() uint64 {
	return s.reader.Count()
}
|
||||||
|
|
||||||
|
// Weight returns the scorer's raw query weight.
func (s *TermSearcher) Weight() float64 {
	return s.scorer.Weight()
}
|
||||||
|
|
||||||
|
// SetQueryNorm forwards the query normalization factor to the scorer.
func (s *TermSearcher) SetQueryNorm(qnorm float64) {
	s.scorer.SetQueryNorm(qnorm)
}
|
||||||
|
|
||||||
|
func (s *TermSearcher) Next() (*DocumentMatch, error) {
|
||||||
|
termMatch, err := s.reader.Next()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if termMatch == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// score match
|
||||||
|
docMatch := s.scorer.Score(termMatch)
|
||||||
|
// return doc match
|
||||||
|
return docMatch, nil
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *TermSearcher) Advance(ID string) (*DocumentMatch, error) {
|
||||||
|
termMatch, err := s.reader.Advance(ID)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if termMatch == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// score match
|
||||||
|
docMatch := s.scorer.Score(termMatch)
|
||||||
|
|
||||||
|
// return doc match
|
||||||
|
return docMatch, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close releases the underlying term/field reader.
func (s *TermSearcher) Close() {
	s.reader.Close()
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
// stubSearcher is a test double that replays a fixed list of matches.
type stubSearcher struct {
	index   int // cursor into matches
	matches DocumentMatchCollection
}
|
||||||
|
|
||||||
|
func (ss *stubSearcher) Next() (*DocumentMatch, error) {
|
||||||
|
if ss.index < len(ss.matches) {
|
||||||
|
rv := ss.matches[ss.index]
|
||||||
|
ss.index++
|
||||||
|
return rv, nil
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ss *stubSearcher) Advance(ID string) (*DocumentMatch, error) {
|
||||||
|
|
||||||
|
for ss.index < len(ss.matches) && ss.matches[ss.index].ID < ID {
|
||||||
|
ss.index++
|
||||||
|
}
|
||||||
|
if ss.index < len(ss.matches) {
|
||||||
|
rv := ss.matches[ss.index]
|
||||||
|
ss.index++
|
||||||
|
return rv, nil
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close is a no-op; the stub holds no resources.
func (ss *stubSearcher) Close() {
}
|
||||||
|
|
||||||
|
// Weight always returns zero; the stub does not participate in scoring.
func (ss *stubSearcher) Weight() float64 {
	return 0.0
}
|
||||||
|
|
||||||
|
// SetQueryNorm is a no-op; the stub ignores normalization.
func (ss *stubSearcher) SetQueryNorm(float64) {
}
|
||||||
|
|
||||||
|
// Count returns the total number of canned matches.
func (ss *stubSearcher) Count() uint64 {
	return uint64(len(ss.matches))
}
|
|
@ -0,0 +1,24 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SQRT_CACHE memoizes square roots of small integer term frequencies so
// the hot scoring path avoids repeated math.Sqrt calls.
var SQRT_CACHE map[int]float64

// MAX_SQRT_CACHE is the number of precomputed square roots (0..63).
const MAX_SQRT_CACHE = 64

// init populates SQRT_CACHE for every value below MAX_SQRT_CACHE.
func init() {
	SQRT_CACHE = make(map[int]float64, MAX_SQRT_CACHE)
	for n := 0; n < MAX_SQRT_CACHE; n++ {
		SQRT_CACHE[n] = math.Sqrt(float64(n))
	}
}
|
|
@ -0,0 +1,64 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package shredder
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/document"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AutoJsonShredder is a simple automatic JSON shredder which parses the
// whole document body. Any strings found in the JSON are added as text
// fields, named by their dotted path within the structure.
type AutoJsonShredder struct {
}
|
||||||
|
|
||||||
|
// NewAutoJsonShredder returns a ready-to-use automatic JSON shredder.
func NewAutoJsonShredder() *AutoJsonShredder {
	return &AutoJsonShredder{}
}
|
||||||
|
|
||||||
|
func (s *AutoJsonShredder) Shred(id string, body []byte) (*document.Document, error) {
|
||||||
|
rv := document.NewDocument(id)
|
||||||
|
|
||||||
|
var section interface{}
|
||||||
|
err := json.Unmarshal(body, §ion)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
shredSection(rv, section, "")
|
||||||
|
|
||||||
|
return rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func shredSection(doc *document.Document, section interface{}, parent string) {
|
||||||
|
nextParent := parent
|
||||||
|
if nextParent != "" {
|
||||||
|
nextParent = nextParent + "."
|
||||||
|
}
|
||||||
|
switch section := section.(type) {
|
||||||
|
|
||||||
|
case string:
|
||||||
|
f := document.NewTextField(parent, []byte(section))
|
||||||
|
doc.AddField(f)
|
||||||
|
|
||||||
|
case []interface{}:
|
||||||
|
for i, sub := range section {
|
||||||
|
shredSection(doc, sub, nextParent+strconv.Itoa(i))
|
||||||
|
}
|
||||||
|
|
||||||
|
case map[string]interface{}:
|
||||||
|
for k, sub := range section {
|
||||||
|
shredSection(doc, sub, nextParent+k)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,55 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package shredder
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/document"
|
||||||
|
"github.com/dustin/go-jsonpointer"
|
||||||
|
)
|
||||||
|
|
||||||
|
// JsonPointerShredder extracts only specific values from the document
// body, addressed by JSON pointer paths registered through AddField /
// AddTextField. Each extracted value becomes a text field.
// (Previous comment was copy-pasted from the automatic shredder.)
type JsonPointerShredder struct {
	fieldPaths map[string]string // field name -> JSON pointer path
	paths      []string          // all registered paths, for batch lookup
}
|
||||||
|
|
||||||
|
// NewJsonPointerShredder returns an empty shredder; register paths with
// AddField / AddTextField before calling Shred.
func NewJsonPointerShredder() *JsonPointerShredder {
	return &JsonPointerShredder{
		fieldPaths: make(map[string]string),
		paths:      make([]string, 0),
	}
}
|
||||||
|
|
||||||
|
// AddTextField registers a JSON pointer path whose value will be indexed
// as a text field called name.
func (s *JsonPointerShredder) AddTextField(name string, path string) {
	s.fieldPaths[name] = path
	s.paths = append(s.paths, path)
}
|
||||||
|
|
||||||
|
// AddField registers a JSON pointer path to extract into a field called
// name. NOTE(review): currently identical to AddTextField; presumably
// intended to support non-text field types later.
func (s *JsonPointerShredder) AddField(name string, path string) {
	s.fieldPaths[name] = path
	s.paths = append(s.paths, path)
}
|
||||||
|
|
||||||
|
// Shred resolves every registered JSON pointer path in body and returns a
// document containing one text field per registered field name.
func (s *JsonPointerShredder) Shred(id string, body []byte) (*document.Document, error) {
	rv := document.NewDocument(id)

	// resolve all pointers in a single pass over the body
	values, err := jsonpointer.FindMany(body, s.paths)
	if err != nil {
		return nil, err
	}

	for fieldName, fieldPath := range s.fieldPaths {
		// NOTE(review): a path absent from the body presumably yields a
		// nil value here — confirm FindMany's behavior for missing paths
		field := document.NewTextField(fieldName, values[fieldPath])
		rv.AddField(field)
	}

	return rv, nil
}
|
|
@ -0,0 +1,17 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package shredder
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/couchbaselabs/bleve/document"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Shredder interface {
|
||||||
|
Shred(id string, body []byte) (document.Document, error)
|
||||||
|
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"log"
|
||||||
|
|
||||||
|
"github.com/couchbaselabs/bleve/index/upside_down"
|
||||||
|
)
|
||||||
|
|
||||||
|
// indexDir selects which index directory to dump.
var indexDir = flag.String("indexDir", "index", "index directory")
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
index := upside_down.NewUpsideDownCouch(*indexDir)
|
||||||
|
err := index.Open()
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
defer index.Close()
|
||||||
|
|
||||||
|
index.Dump()
|
||||||
|
}
|
Loading…
Reference in New Issue