0
0
Fork 0

added an analyzer to get all the chunks out of the content

this will be used for building the index
This commit is contained in:
Gibheer 2011-06-10 20:45:16 +02:00
parent 9fa082d8d9
commit 6d2ffb8ec7
3 changed files with 104 additions and 0 deletions

View File

@ -0,0 +1,42 @@
require 'polecat/stemmer/simple'
module Polecat
  # A simple analyzer which is enough for most English content.
  #
  # This class analyzes content in a standard English way, which should be
  # enough for most cases. It splits the content at whitespace and applies
  # the stemmer to every non-empty chunk found.
  class StandardAnalyzer
    # Returns the stemmer in use.
    # @return [#stem] an object which responds to #stem
    attr_reader :stemmer

    # Creates a new analyzer.
    #
    # The analyzer uses the given stemmer, or a simple stemmer when none is
    # given. A stemmer has to respond to #stem, as the analyzer calls it for
    # every chunk found in the content.
    #
    # As this is a standard analyzer it should work for most cases. Change the
    # stemmer if the words found do not match your expectations.
    #
    # @param stemmer [#stem] a stemmer responding to #stem
    # @raise [ArgumentError] if the stemmer does not respond to #stem
    def initialize(stemmer = Polecat::SimpleStemmer.new)
      unless stemmer.respond_to?(:stem)
        raise ArgumentError, 'stemmer does not know #stem'
      end

      @stemmer = stemmer
    end

    # Analyzes the content and stems every chunk found.
    #
    # The content is split at runs of whitespace. Empty chunks (which would
    # otherwise appear for leading whitespace) are dropped, so the stemmer is
    # only ever called with real words. The stemmed chunks are returned in the
    # order in which they appear in the content.
    #
    # @param content [#split] the content to analyze, usually a String
    # @return [Array] the result of #stem for every non-empty chunk
    # @raise [ArgumentError] if the content does not respond to #split
    def analyze(content)
      unless content.respond_to?(:split)
        raise ArgumentError, "#{content.class} has no #split"
      end

      # /\s+/ treats consecutive whitespace as one separator; a leading
      # separator still yields one empty chunk, which is rejected below.
      chunks = content.split(/\s+/).reject do |chunk|
        chunk.respond_to?(:empty?) && chunk.empty?
      end
      chunks.map { |chunk| @stemmer.stem(chunk) }
    end
  end
end

View File

@ -0,0 +1,40 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require 'polecat/analyzer/standard'
describe "StandardAnalyzer#analyze" do
  # Stemmer double whose #stem is stubbed without a return value; individual
  # examples re-stub it when they need a specific result.
  let(:s) do
    stemmer = double
    stemmer.stub(:stem)
    stemmer
  end

  # Analyzer under test, built around the stemmer double.
  let(:a) { Polecat::StandardAnalyzer.new(s) }

  it "takes an argument" do
    analyze = a.method(:analyze)
    analyze.arity.should be(1)
  end

  it "takes an object which implements the method split" do
    s.stub(:stem) { "foo" }
    result = a.analyze("foo")
    result.should == ["foo"]
  end

  it "returns an array of found elements" do
    result = a.analyze("foo")
    result.count.should be(1)
  end

  it "calls the method #stem of the stemmer for every chunk found" do
    s.stub(:stem) { "stem" }
    s.should_receive(:stem).with("stemmed")
    result = a.analyze("stemmed")
    result.should == ["stem"]
  end

  it "splits the content at whitespaces" do
    result = a.analyze("foo bar baz")
    result.count.should == 3
  end

  it "calls the method #stem for every word in the content" do
    s.stub(:stem) { "stem" }
    s.should_receive(:stem).with("stemmed")
    result = a.analyze("stemmed stemmer")
    result.should == ["stem", "stem"]
  end

  it "raises an error when the argument has not method split" do
    # An Integer does not respond to #split.
    failing_call = lambda { a.analyze(2) }
    failing_call.should raise_error(ArgumentError)
  end
end

View File

@ -0,0 +1,22 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require 'polecat/stemmer/simple'
require 'polecat/analyzer/standard'
describe "StandardAnalyzer#new" do
  # A real simple stemmer, used to check that a given stemmer is kept as is.
  let(:s) { Polecat::SimpleStemmer.new }

  it "uses the simple stemmer as default" do
    analyzer = Polecat::StandardAnalyzer.new
    analyzer.stemmer.class.should == Polecat::SimpleStemmer
  end

  it "takes an object as an argument which has a method #stem" do
    analyzer = Polecat::StandardAnalyzer.new(s)
    analyzer.stemmer.should be(s)
  end

  it "raises an error when the argument does not know the method #stem" do
    # A String does not respond to #stem.
    construction = lambda { Polecat::StandardAnalyzer.new("foo") }
    construction.should raise_error(ArgumentError)
  end
end