From fb0db7066fb7d033218b5beb7070ccc446442a68 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 6 Feb 2015 12:35:01 -0500 Subject: [PATCH] improve highlighting with multi-byte characters fixes #156 --- .../fragmenters/simple/fragmenter_simple.go | 58 +++++++++++++++---- .../simple/fragmenter_simple_test.go | 28 ++++++++- 2 files changed, 73 insertions(+), 13 deletions(-) diff --git a/search/highlight/fragmenters/simple/fragmenter_simple.go b/search/highlight/fragmenters/simple/fragmenter_simple.go index 3a53074f..d78aacaa 100644 --- a/search/highlight/fragmenters/simple/fragmenter_simple.go +++ b/search/highlight/fragmenters/simple/fragmenter_simple.go @@ -10,6 +10,8 @@ package simple import ( + "unicode/utf8" + "github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/search/highlight" ) @@ -32,21 +34,38 @@ func (s *Fragmenter) Fragment(orig []byte, ot highlight.TermLocations) []*highli rv := make([]*highlight.Fragment, 0) maxbegin := 0 +OUTER: for currTermIndex, termLocation := range ot { // start with this // it should be the highest scoring fragment with this term first start := termLocation.Start - end := start + s.fragmentSize - if end > len(orig) { - end = len(orig) - // we hit end, so push back as far as we can without crossing maxbegin - extra := s.fragmentSize - (end - start) - if start-extra >= maxbegin { - start -= extra + end := start + used := 0 + for end < len(orig) && used < s.fragmentSize { + r, size := utf8.DecodeRune(orig[end:]) + if r == utf8.RuneError { + continue OUTER // bail + } + end += size + used += 1 + } + + // if we still have more characters available to us + // push back towards beginning + // without crossing maxbegin + for start > 0 && used < s.fragmentSize { + r, size := utf8.DecodeLastRune(orig[0:start]) + if r == utf8.RuneError { + continue OUTER // bail + } + if start-size >= maxbegin { + start -= size + used += 1 } else { - start = maxbegin + break } } + // however, we'd rather have the tokens centered more in the 
frag // lets try to do that as best we can, without affecting the score // find the end of the last term in this fragment @@ -59,12 +78,29 @@ func (s *Fragmenter) Fragment(orig []byte, ot highlight.TermLocations) []*highli } // find the smaller of the two rooms to move - roomToMove := end - minend - if start-maxbegin < roomToMove { - roomToMove = start - maxbegin + roomToMove := utf8.RuneCount(orig[minend:end]) + roomToMoveStart := utf8.RuneCount(orig[maxbegin:start]) + if roomToMoveStart < roomToMove { + roomToMove = roomToMoveStart } offset := roomToMove / 2 + + for offset > 0 { + r, size := utf8.DecodeLastRune(orig[0:start]) + if r == utf8.RuneError { + continue OUTER // bail + } + start -= size + + r, size = utf8.DecodeLastRune(orig[0:end]) + if r == utf8.RuneError { + continue OUTER // bail + } + end -= size + offset-- + } + rv = append(rv, &highlight.Fragment{Orig: orig, Start: start - offset, End: end - offset}) // set maxbegin to the end of the current term location // so that next one won't back up to include it diff --git a/search/highlight/fragmenters/simple/fragmenter_simple_test.go b/search/highlight/fragmenters/simple/fragmenter_simple_test.go index 5c793145..99ada6e1 100644 --- a/search/highlight/fragmenters/simple/fragmenter_simple_test.go +++ b/search/highlight/fragmenters/simple/fragmenter_simple_test.go @@ -22,6 +22,7 @@ func TestSimpleFragmenter(t *testing.T) { orig []byte fragments []*highlight.Fragment ot highlight.TermLocations + size int }{ { orig: []byte("this is a test"), @@ -40,6 +41,7 @@ func TestSimpleFragmenter(t *testing.T) { End: 14, }, }, + size: 100, }, { orig: []byte("0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789"), @@ -58,6 +60,7 @@ func TestSimpleFragmenter(t *testing.T) { End: 100, }, }, + size: 100, }, { orig: []byte("01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"), @@ -175,16 +178,37 @@ func TestSimpleFragmenter(t 
*testing.T) { End: 100, }, }, + size: 100, + }, + { + orig: []byte("[[पानी का स्वाद]] [[नीलेश रघुवंशी]] का कविता संग्रह हैं। इस कृति के लिए उन्हें २००४ में [[केदार सम्मान]] से सम्मानित किया गया है।{{केदार सम्मान से सम्मानित कृतियाँ}}"), + fragments: []*highlight.Fragment{ + &highlight.Fragment{ + Orig: []byte("[[पानी का स्वाद]] [[नीलेश रघुवंशी]] का कविता संग्रह हैं। इस कृति के लिए उन्हें २००४ में [[केदार सम्मान]] से सम्मानित किया गया है।{{केदार सम्मान से सम्मानित कृतियाँ}}"), + Start: 0, + End: 411, + }, + }, + ot: highlight.TermLocations{ + &highlight.TermLocation{ + Term: "पानी", + Pos: 1, + Start: 2, + End: 14, + }, + }, + size: 200, }, } - fragmenter := NewFragmenter(100) for _, test := range tests { + fragmenter := NewFragmenter(test.size) fragments := fragmenter.Fragment(test.orig, test.ot) if !reflect.DeepEqual(fragments, test.fragments) { t.Errorf("expected %#v, got %#v", test.fragments, fragments) for _, fragment := range fragments { - t.Logf("frag: %#v", fragment) + t.Logf("frag: %s", fragment.Orig[fragment.Start:fragment.End]) + t.Logf("frag: %d - %d", fragment.Start, fragment.End) } } }