# Copyright 2002-2008 Josh Clark and Global Moxie, LLC. This code cannot be
# redistributed without permission from globalmoxie.com.  For more
# information, consult your Big Medium license.
#
# $Id: WordParser.pm 3043 2008-03-31 14:00:38Z josh $

package BigMed::Search::WordParser;
use strict;
use warnings;
use utf8;
use Lingua::Stem;
use BigMed::Search::Stopwords;

#locales available via pure-perl Lingua::Stem
my @STEM_LOCALES = qw(da de en en-us en-uk fr gl no pt sv);
my %CAN_STEM;
@CAN_STEM{@STEM_LOCALES} = @STEM_LOCALES;

sub new {
    my ( $class, $locale ) = @_;
    $locale = lc( $locale || 'en-us' );
    $locale = 'en-us' if $locale eq 'en';
    $locale =~ s/_/-/msg;

    my $self = bless { locale => $locale }, $class;
    return $self;
}

sub get_words {
    my ( $self, $text ) = @_;

    my $rstop = $self->stopwords;
    $text = lc $text;
    
    #originally tried splitting words by regex grouping:
    #    @words = ( $text =~ /(\p{L}+)/ )
    #this worked fine for 5.8, but 5.6 mangled the characters.
    #splitting seems to do the trick.
    
    my @words = grep {
        s/\A\d+\z//ms;                       #strip out numbers
        
        #not a single alpha character, or a stopword
        $_ && $_ !~ /\A[a-zA-Z]\z/ && !exists $rstop->{$_};
      }
      split(/[^\p{L}]+/, $text); #\p{L}: unicode letter property

    #stem the words
    my $stemmer = $self->stemmer;
    if ($stemmer) {
        my $method =
          index( $self->locale, 'en' ) == 0 ? 'stem_in_place' : 'stem';

        #stemmer can return empty words, notably for numbers; strip 'em
        return [grep { $_ } @{ $stemmer->$method(@words) }];
    }

    return \@words;
}

sub locale {
    return $_[0]->{locale};
}

sub stemmer {
    my $self = shift;
    return $self->{stemmer} if exists $self->{stemmer};

    my $stemmer;
    if ( $CAN_STEM{ $self->{locale} } ) {
        $stemmer = Lingua::Stem->new( -locale => $self->{locale} );
        $stemmer->stem_caching( { -level => 2 } );
    }
    $self->{stemmer} = $stemmer;
    return $stemmer;
}

sub stopwords {
    my $self = shift;
    $self->{stopwords} = BigMed::Search::Stopwords->hashref( $self->{locale} )
      if !exists $self->{stopwords};
    return $self->{stopwords};
}

1;
__END__

=head1 NAME

BigMed::Search::WordParser - Extracts and stems words for indexing

=head1 DESCRIPTION

Extracts words from a string of text, stemming words (for supported
languages) and omitting stopwords (also for supported languages).

=head1 SUMMARY

    use BigMed::Search::WordParser;
    my $parser = BigMed::Search::WordParser->new('en');
    my $text = 'The quick brown fox jumped over the lazy dog quickly';
    my $rwords = $parser->get_words($text);

    # $rwords is an array reference:
    # ['quick', 'brown', 'fox', 'jump', lazi', 'dog', 'quickli']
    #
    # note that 'jumped' is stemmed to 'jump', 'lazy' to 'lazi'
    # and 'quickly' to 'quickli';
    # 'the' and 'over' are omitted
    
=head1 METHODS

=head2 new

    $parser = BigMed::Search::WordParser->new($iso_locale)
    $parser = BigMed::Search::WordParser->new() #locale default is 'en-us'

Returns a BigMed::Search::WordParser object. You can specify the ISO
locale in the first argument to tell the parser what language you're
parsing. The locale is case-insensitive, and the default is C<en-us>.

=head2 get_words

    my $rwords = $parser->get_words($text);

Returns an array reference of words found in the text (the text must
use UTF-8 encoding). Punctuation, space, symbols and marks are all
considered word boundaries and are not included in the returned words.

For supported languages, the words are stemmed via Lingua::Stem.
Stemming removes grammatical information from words to find their root,
treating both "training" and "trains," for example, as "train."
Stemming is supported for the following languages (words in other languages
go unmodified):

=over 4

=item * German (de)

=item * English (en, en-us, en-uk)

=item * French (fr)

=item * Galician (gl)

=item * Portuguese (pt)

=back

A list of built-in stopwords are removed from the list for supported
languages. Stopwords are words that are so common as to have little
functional meaning. In English, examples are "of", "the", "it", "you", and
"and." In all cases, words of just one character are removed. Languages
with built-in stopwords are:

=over 4

=item * Czech (cs)

=item * Danish (da)

=item * German (de)

=item * English (en)

=item * Spanish (es)

=item * Finnish (fi)

=item * French (fr)

=item * Italian (it)

=item * Dutch (nl)

=item * Norwegian (no)

=item * Portuguese (pt)

=item * Sweedish (sv)

=back

=head2 locale

    my $locale = $parser->locale;

Returns the locale that was set when the object was constructed.

=head2 stemmer

    my $stemmer = $parser->stemmer

Returns a Lingua::Stem object to use for stemming words in the current
locale. If no stemmer is available for the locale, returns false.

=head2 stopwords

    my $rhash = $parser->stopwords

Wrapper to the hashref method of BigMed::Search::Stopwords for the
current locale. Returns a hash reference whose keys are the stopwords
to use and whose values are true.

=head1 AUTHOR & COPYRIGHTS

This module and all Big Medium modules are copyright Josh Clark
and Global Moxie. All rights reserved.

Use of this module and the Big Medium content
management system are governed by Global Moxie's software licenses
and may not be used outside of the terms and conditions outlined
there.

For more information, visit the Global Moxie website at
L<http://globalmoxie.com/>.

Big Medium and Global Moxie are service marks of Global Moxie
and Josh Clark. All rights reserved.

=cut

