added an analyzer to get all the chunks out of the content
this will be used for building the index
This commit is contained in:
parent
9fa082d8d9
commit
6d2ffb8ec7
|
@ -0,0 +1,42 @@
|
|||
require 'polecat/stemmer/simple'
|
||||
|
||||
module Polecat
  # A simple analyzer which is enough for most English content.
  #
  # It splits the content into chunks at whitespace and applies the stemmer
  # to every chunk found.
  class StandardAnalyzer
    # The stemmer used to stem every chunk.
    #
    # @return [#stem] an object which knows the method #stem
    attr_reader :stemmer

    # Create a new analyzer.
    #
    # The analyzer uses the given stemmer, or a Polecat::SimpleStemmer when
    # none is given. A stemmer has to know the method #stem, as it is used to
    # stem the chunks found in the content.
    #
    # Change the stemmer if the words found do not match your expectations.
    #
    # @param stemmer [#stem] a stemmer with the method #stem
    # @raise [ArgumentError] if the stemmer does not respond to #stem
    def initialize(stemmer = Polecat::SimpleStemmer.new)
      unless stemmer.respond_to?(:stem)
        raise ArgumentError, 'stemmer does not know #stem'
      end

      @stemmer = stemmer
    end

    # Analyze the content and stem every chunk found.
    #
    # The content is split at runs of whitespace. Empty chunks (caused by
    # leading whitespace) are dropped, so the stemmer is only ever called
    # with non-empty words.
    #
    # @param content [#split] the content to analyze, usually a String
    # @return [Array] the stemmed chunks, in the order they were found
    # @raise [ArgumentError] if the content does not respond to #split
    def analyze(content)
      unless content.respond_to?(:split)
        raise ArgumentError, "#{content.class} has no #split"
      end

      # /\s+/ collapses consecutive whitespace; a leading run still yields one
      # empty first element, which the reject removes.
      chunks = content.split(/\s+/).reject { |chunk| chunk.empty? }
      chunks.map { |chunk| @stemmer.stem(chunk) }
    end
  end
end
|
|
@ -0,0 +1,40 @@
|
|||
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
||||
require 'polecat/analyzer/standard'
|
||||
|
||||
# Specs for Polecat::StandardAnalyzer#analyze, run against a stemmer double
# so that only the analyzer's splitting and delegation are exercised.
describe "StandardAnalyzer#analyze" do
  # stemmer double that answers #stem; examples re-stub it when they care
  # about the returned value
  let(:s) do
    stemmer = double
    stemmer.stub(:stem)
    stemmer
  end
  # analyzer under test, wired to the stemmer double
  let(:a) { Polecat::StandardAnalyzer.new s }

  it "takes an argument" do
    a.method(:analyze).arity.should be(1)
  end

  it "takes an object which implements the method split" do
    s.stub(:stem) { "foo" }
    a.analyze("foo").should == ["foo"]
  end

  it "returns an array of found elements" do
    a.analyze("foo").count.should be(1)
  end

  it "calls the method #stem of the stemmer for every chunk found" do
    s.should_receive(:stem).with("stemmed").and_return("stem")
    a.analyze("stemmed").should == ["stem"]
  end

  it "splits the content at whitespaces" do
    a.analyze("foo bar baz").count.should == 3
  end

  it "calls the method #stem for every word in the content" do
    # one expectation per word, so a skipped word fails the example
    s.should_receive(:stem).with("stemmed").and_return("stem")
    s.should_receive(:stem).with("stemmer").and_return("stem")
    a.analyze("stemmed stemmer").should == ["stem", "stem"]
  end

  it "raises an error when the argument has not method split" do
    lambda { a.analyze 2 }.should raise_error(ArgumentError)
  end
end
|
|
@ -0,0 +1,22 @@
|
|||
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
||||
require 'polecat/stemmer/simple'
|
||||
require 'polecat/analyzer/standard'
|
||||
|
||||
# Specs for the constructor of Polecat::StandardAnalyzer: default stemmer,
# custom stemmer, and rejection of objects without #stem.
describe "StandardAnalyzer#new" do
  # a real stemmer instance to hand to the constructor
  let(:s) { Polecat::SimpleStemmer.new }

  it "uses the simple stemmer as default" do
    analyzer = Polecat::StandardAnalyzer.new
    analyzer.stemmer.should be_instance_of(Polecat::SimpleStemmer)
  end

  it "takes an object as an argument which has a method #stem" do
    analyzer = Polecat::StandardAnalyzer.new(s)
    analyzer.stemmer.should be(s)
  end

  it "raises an error when the argument does not know the method #stem" do
    # a String has no #stem, so construction must fail
    construct = lambda { Polecat::StandardAnalyzer.new("foo") }
    construct.should raise_error(ArgumentError)
  end
end
|
Loading…
Reference in New Issue