# Copyright 2002-2008 Josh Clark and Global Moxie, LLC. This code cannot be
# redistributed without permission from globalmoxie.com.  For more
# information, consult your Big Medium license.
#
# $Id: Indexer.pm 3106 2008-06-11 19:15:08Z josh $

package BigMed::Search::Indexer;
use strict;
use warnings;
use utf8;
use Carp;
$Carp::Verbose = 1;

use BigMed::Search::WordParser;
use BigMed;
use BigMed::DiskUtil qw(
    bm_file_path        bm_copy_file
    bm_delete_file      bm_datafile_chmod
    bm_confirm_dir      bm_move_file
    bm_untaint_filepath
 );
use BigMed::Search::PageIndex;
use BigMed::Pointer;
use BigMed::Trigger;
use BigMed::Log;
use Fcntl qw(:DEFAULT :flock);

my $PERL_5_8 = ( $] >= 5.008 ); #determines utf-8 handling
my $ERR = 'BigMed::Error';
my $MID_PAGE_COUNT = 10; #page interval to call mid_index trigger

# CONSTRUCTOR ------------------------------------------------------

sub new {
    my ( $class, $locale ) = @_;
    $locale = lc( $locale || 'en-us' );
    $locale = 'en-us' if $locale eq 'en';
    $locale =~ s/_/-/msg;

    my $self = bless {
        locale => $locale,
        parser => BigMed::Search::WordParser->new($locale),
        pcount => 0,
        total_words => 0,
    }, $class;
    return $self;
}

# FIELDS AND RELATIONS TO INDEX ------------------------------------

my %INDEX_FIELD = (
    'title' => {
        weight => 1,
        parse  => \&html_to_text,
    },
    'description' => {
        weight => 0.8,
        parse  => \&richtext_to_text,
    },
    'meta_description' => {
        weight => 0.8,
        parse  => \&html_to_text,
    },
    'meta_keywords' => {
        weight => 0.8,
        parse  => \&html_to_text,
    },
    'content' => {
        weight => 0.6,
        parse  => \&richtext_to_text,
    },
);

my %INDEX_RELATION = (
    'tag' => {
        weight => 0.8,
        parse  => \&tags_to_text,
    },
    'author' => {
        weight => 0.6,
        parse  => \&author_to_text,
    },
    'media' => {
        weight => 0.4,
        parse  => \&media_to_text,
    },
    'comment' => {
        weight => 0.2,
        parse  => \&comment_to_text,
    },

    #subtype-specific
    'download' => {
        weight  => 1,
        parse   => \&download_to_text,
        subtype => { 'download' => 1 },
    },
    'podcast' => {
        weight  => 1,
        parse   => \&podcast_to_text,
        subtype => { 'podcast' => 1 },
    },
    'link_url' => {
        weight  => 1,
        parse   => \&link_url_to_text,
        subtype => { 'link' => 1 },
    },
);

sub register_field {
    my $self = shift;
    _register( \%INDEX_FIELD, @_ );
    return;
}

sub register_relation {
    my $self = shift;
    _register( \%INDEX_RELATION, @_ );
    return;
}

sub _register {
    my $rindex = shift;
    foreach my $rhash (@_) {
        my $name = $rhash->{name}
          or croak 'hash reference must include name value';
        my $rmeta = $rindex->{$name} || {};
        $rmeta->{weight} = $rhash->{weight} if exists $rhash->{weight};
        $rmeta->{parse} = $rhash->{parse} if ref $rhash->{parse} eq 'CODE';
        if ( $rhash->{subtype} ) {
            my @subtypes =
              ref $rhash->{subtype} eq 'ARRAY'
              ? @{ $rhash->{subtype} }
              : ( $rhash->{subtype} );
            $rmeta->{subtype} = { map { $_ => 1 } @subtypes };
        }
        $rmeta->{parse} = $rhash->{parse} if ref $rhash->{parse} eq 'CODE';
        if ( $rmeta->{weight} && $rmeta->{weight} > 0 ) {
            croak "no parse subroutine supplied for $name registration"
              if !$rmeta->{parse};
            $rindex->{$name} = $rmeta;
        }
        else {
            delete $rindex->{$name};
        }
    }
    return;
}

# MAIN INDEX METHODS -----------------------------------------------

sub index_page {
    my ( $self, $data ) = @_;
    $self->{pcount} = $self->{wcount} = $self->{total_words} = 0;

    my ( $site, %add, %delete, @deindex );
    my $pcount = 0;
    my $rtally_page = sub {
        my ( $page, $index_cache, $rcache ) = @_;
        $site ||= $page->site;
        croak 'all objects must be from same site' if $site != $page->site;

        #never index pages with the "suppress all links" flag
        my %flag = $page->flags;
        if ( $flag{hideall} ) {
            push @deindex, $page->id;
            return 1;
        }

        my ( $rwcount, $rdelete ) =
          $self->_update_page_index( $page, $index_cache, $rcache );
        return if !$rwcount;    #i/o error

        my $id = $page->id;
        foreach my $word ( keys %{$rwcount} ) {
            $add{$word}->{$id} = $rwcount->{$word};
        }
        foreach my $word ( @{$rdelete} ) {
            push @{ $delete{$word} }, $id;
        }
        $self->{pcount}      = ++$pcount;
        $self->{total_words} = scalar keys %add;
        $self->call_trigger('mid_index') if !( $pcount % $MID_PAGE_COUNT );
        return 1;
    };

    if ( $data->isa('BigMed::Content::Page') ) {
        $rtally_page->($data) or return; #no index or pointer cache
        return $self->remove_page($data) if @deindex;
    }
    elsif ( $data->isa('BigMed::Driver') ) {
        $data->set_index(0);
        my ( $page, $index_cache, $rcache );
        if ( $data->count > 2 ) {    #load the pointer and index caches
            my $site = $data->next_index->{site};
            $data->set_index(0);
            my $pcache = BigMed::Pointer->select( { site => $site } ) or return;
            $pcache->tune('source_id');
            $rcache = { 'BigMed::Pointer' => $pcache };
            $index_cache =
              BigMed::Search::PageIndex->select( { site => $site } )
              or return;
            $index_cache->tune('page');
        }
        while ( $page = $data->next ) {
            $rtally_page->( $page, $index_cache, $rcache ) or return;
        }
        return if !defined $page;
        
        if (@deindex) { #one or more suppressed pages, remove them
            $self->log( 'info' => "Indexer: Found page(s) to suppress: "
                  . join( ', ', @deindex ) );
            my $remove = $data->select({site=>$site, id=>\@deindex})
              or return;
            $self->remove_page($remove) or return;
        }
        $data->set_index(0);
    }
    $self->log(
        'info' => "Indexer: Adding page(s) to search index for site $site" )
      if $site && %add;

    return $self->_update_keyword_index( \%add, \%delete, $site );
}

sub remove_page {    #self, array of objects and/or selects
    my ( $self, $data ) = @_;
    my ( $site, @page_ids );
    $self->{pcount} = $self->{wcount} = $self->{total_words} = 0;
    
    #collect page ids to update
    if (ref $data eq 'HASH') {
        $site = $data->{site}
          or croak 'remove_page requires site parameter in hash argument';
        $site = $site->id if ref $site;

        my $pids = $data->{pages};
        croak 'remove_page requires array ref in pages parameter of hash arg'
          if ref $pids ne 'ARRAY';
        @page_ids = @{$pids};
    }
    elsif ( $data->isa('BigMed::Content::Page') ) {
        push @page_ids, $data->id;
        $site = $data->site;
    }
    elsif ( $data->isa('BigMed::Driver') ) {
        $data->set_index(0);
        while ( my $rindex = $data->next_index ) {
            push @page_ids, $rindex->{id};
            $site ||= $rindex->{site};
            croak 'remove_page requires all objects to be from same site'
              if $site != $rindex->{site};
        }
        $data->set_index(0);
    }
    if ( !@page_ids || !$site ) {
        $site ||= q{};
        $self->log( info => "Indexer: page removal requested for site $site "
              . ' but no pages or no site specified' );
        return 1;
    }

    my $rpages = \@page_ids;
    $self->log( info => "Indexer: Page removal requested for site $site: "
          . join( ', ', @page_ids ) );

    #gather words for all pages
    my $indices =
      BigMed::Search::PageIndex->select( { site => $site, page => $rpages } )
      or return;
    $self->{total_words} = $indices->count;
    my ( $pindex, %word_kw );
    while ( $pindex = $indices->next ) {
        my @pwords;
        if ($PERL_5_8) {
            @pwords = $pindex->words;
        }
        else { #perl 5.6: convert to bytes for storage in hash
            @pwords = map { pack "C*", unpack "U0C*", $_ } $pindex->words;
        }
        @word_kw{ @pwords } = ($rpages) x @pwords;
        $self->{wcount} = scalar keys %word_kw;
        $self->call_trigger('mid_index') if !( ++$self->{pcount} % $MID_PAGE_COUNT );
    }
    return   if !defined $pindex;        #i/o error
    return 1 if !%word_kw;               #no words

    #delete page indices
    $indices->trash_all or return;
    undef $indices;

    #update keyword records
    $self->log( 'info' =>
          "Indexer: Removed page(s) from search index for site $site" );
    return $self->_update_keyword_index( {}, \%word_kw, $site );
}

sub count_page_words {
    my ( $self, $page, $rcache ) = @_;
    my $tcount = 0;    #will be weighted word count
    my $parser = $self->{parser};

    #gather words from each field and tally the word's weighted count
    my %wcount;
    my $rtally_score = sub {
        my ($text, $weight) = @_;
        my $rwords = $parser->get_words($text);
        $tcount += $weight * @{$rwords};
        foreach my $w ( @{$rwords} ) {
            if ( !$PERL_5_8 ) { #v5.6 can't do unicode hash keys
                $w = pack "C*", unpack "U0C*", $w;
            }
            $wcount{$w} ||= 0;
            $wcount{$w} += $weight;
        }
    };

    foreach my $col ( keys %INDEX_FIELD ) {
        my $value = $page->$col or next;
        my $rinfo = $INDEX_FIELD{$col};
        next
          if $rinfo->{subtype}
          && ( !$page->subtype || !$rinfo->{subtype}->{ $page->subtype } );

        $value = $rinfo->{parse}->($value) or next;
        $rtally_score->($value, $rinfo->{weight});
    }

    #get the content for related fields
    foreach my $rel ( keys %INDEX_RELATION ) {
        my $rinfo = $INDEX_RELATION{$rel};
        next
          if $rinfo->{subtype}
          && ( !$page->subtype || !$rinfo->{subtype}->{ $page->subtype } );
        my $value  = $rinfo->{parse}->( $page, $rcache ) or next;
        $rtally_score->($value, $rinfo->{weight});
    }

    #got all words in a hash with raw score; adjust for overall word count;
    #result is the weighted term frequency within the document
    #(the "tf" in tf-idf); will get stored in keyword record.
    #
    #store as integer to make comparison easier in testing; we lose
    #a bit of detail by not rounding the fifth decimal place, but that
    #shouldn't be a huge deal for this use.
    foreach my $c ( values %wcount ) {
        $c = int( ( $c / $tcount ) * 10_000 );
    }
    return \%wcount;
}

# PROGRESS METHODS -------------------------------------------------
sub page_progress {
    return $_[0]->{pcount};
}
sub word_progress {
    $_[0]->{total_words};
}


# INDEX SUPPORT ----------------------------------------------------

sub _update_keyword_index {
    my ( $self, $radd, $rdelete, $site ) = @_;

    my @words;
    {
        my %update = map { $_ => 1 } ( keys %{$radd}, keys %{$rdelete} );
        return 1 if !%update;    #no changes
        @words = sort keys %update;
    }
    $self->{total_words} = scalar @words;
    $self->{wcount} = 0;

    #get file path
    my $mdata = BigMed->bigmed->env('MOXIEDATA')
      or croak 'no moxiedata directory defined';
    my $search_dir = bm_file_path( $mdata, 'search' );
    bm_confirm_dir( $search_dir, { data => 1, build_path => 1 } ) or return;
    my $main_index =
      bm_untaint_filepath( bm_file_path( $search_dir, "site$site.cgi" ) )
      or return;
    my $backup = $main_index . '_bak';

    #create a lock file to avoid dupes
    my $LOCKFILE;
    my $lockpath = bm_untaint_filepath( bm_file_path( $search_dir, 'lock' ) )
      or return;
    sysopen( $LOCKFILE, $lockpath, O_WRONLY | O_TRUNC | O_CREAT, 0777 )
      or return $ERR->set_io_error( $LOCKFILE, 'open', $lockpath, $! );
    flock( $LOCKFILE, LOCK_EX )
      or return $ERR->set_io_error( $LOCKFILE, 'lock_ex', $lockpath, $! );
    print $LOCKFILE 'lock'
      or return $ERR->set_io_error( $LOCKFILE, 'write', $lockpath, $! );
    
    #open the main index file
    my $chmod = bm_datafile_chmod();
    my $DATAINDEX;
    sysopen( $DATAINDEX, $main_index, O_WRONLY | O_CREAT, $chmod )
      or return $ERR->set_io_error( $DATAINDEX, 'open', $main_index, $! );

    #copy to backup before flocking to avoid trouble in some windows servers
    bm_copy_file( $main_index, $backup )
      or return $ERR->set_io_error( $DATAINDEX, 'copy', $main_index, $! );
    
    flock( $DATAINDEX, LOCK_EX )
      or return $ERR->set_io_error( $DATAINDEX, 'lock_ex', $main_index, $! );

    #in perl 5.8, our words are utf8, so we need binmode.
    #in 5.6, our words are already converted to raw bytes elsewhere
    #in this module, so no need to do anything with them.
    binmode ($DATAINDEX, ":utf8") if $PERL_5_8;

    #use backup to read original data
    my $ORIG;
    sysopen( $ORIG, $backup, O_RDONLY )
      or return $ERR->set_io_error( $ORIG, "open", $backup, $! );
    flock( $ORIG, LOCK_SH )
      or return $ERR->set_io_error( $ORIG, 'lock_sh', $backup, $! );
    binmode ($ORIG, ":utf8") if $PERL_5_8;

    #truncate the main index file, start fresh
    truncate( $DATAINDEX, 0 )
      or return $ERR->set_io_error( $DATAINDEX, "truncate", $main_index, $! );

    my $next = shift @words;
    OLDWORDS: while (my $line = <$ORIG>) {
        #perl 5.6 goes nutty if you try to do this via regex capture:
        #    ($kw) = ($line =~ /\A(.+?):/ms);
        #have to use split instead, which is likely slower
        my $kw = ( split(/:/, $line) )[0] or next;
        while ($next && $next le $kw) { #we have a word to insert
            my ($rnew, $rdel) = ($radd->{$next}, $rdelete->{$next});
            my $kw_entry = $next lt $kw
              ? _build_flat_kw_entry( $next, q{}, $rnew, $rdel )      # new
              : _build_flat_kw_entry( $next, $line, $rnew, $rdel );   # exists

            if ($kw_entry) { #kw_entry is raw bytes for 5.6, unicode for 5.8+
                print $DATAINDEX $kw_entry, "\n"
                  or return _restore_backup($backup, $main_index);
            }
            if ($next eq $kw) { #get the next original keyword
                $next = shift @words;
                next OLDWORDS;
            }
            $next = shift @words;
        }
 
        print $DATAINDEX $line #add current word
          or return _restore_backup($backup, $main_index);

    }
    
    #no more original entries, append any additional lines
    close ($ORIG);
    unshift @words, $next if $next; #put the current word back into stack
    foreach $next (@words) {
        my ($rnew, $rdel) = ($radd->{$next}, $rdelete->{$next});
        my $kw_entry = _build_flat_kw_entry( $next, q{}, $rnew, $rdel )
          or next;
        print $DATAINDEX $kw_entry, "\n"
          or return _restore_backup($backup, $main_index);
    }
    
    #all done; delete backup and close index
    bm_delete_file($backup)
      or return $ERR->set_io_error( undef, 'unlink', $backup, $! );
    close($DATAINDEX)
      or return $ERR->set_io_error( undef, 'close', $main_index, $! );
    close($LOCKFILE)
      or return $ERR->set_io_error( undef, 'close', $lockpath, $! );
    return 1;
}

sub _restore_backup { #problem writing to main index
    my ($backup, $main_index) = @_;
    $ERR->set_io_error( undef, 'write', $main_index, $! );
    bm_move_file($backup, $main_index) if -e $backup;
    return;
}

sub _build_flat_kw_entry {
    my ( $word, $old_line, $radd, $rdelete ) = @_;
    $radd ||= {};
    my %pcount;
    if ($old_line) {
        chomp $old_line;
        my ($kw, $docs, $raw_count) = split /:/, $old_line;
        %pcount = map { split /-/ } ( split /!/, $raw_count );
    }
    while ( my ( $k, $v ) = each %{$radd} ) {
        $pcount{$k} = $v;
    }
    delete @pcount{ @{$rdelete} } if $rdelete;
    
    my @counts;
    while ( my ( $k, $v ) = each %pcount ) {
        push @counts, "$k-$v";
    }
    my $docs = @counts or return q{};
    return "$word:$docs:" . join('!', @counts);
}

sub _update_page_index {
    my ( $self, $page, $index_cache, $rcache ) = @_;
    my $pid = $page->id or $page->update_id or return;

    #collect existing pageindex, if any; otherwise initialize
    $index_cache ||= 'BigMed::Search::PageIndex';
    my $pindex =
      $index_cache->fetch(
        { site => $page->site, page => $pid } );
    if ( !$pindex ) {
        return if !defined $pindex;
        $pindex = BigMed::Search::PageIndex->new();
        $pindex->update_id or return;
        $pindex->set_page($pid);
        $pindex->set_site( $page->site );
    }

    #collect word tally and find any defunct words from existing
    my $rwcount = $self->count_page_words( $page, $rcache );
    my @delete = grep { !$rwcount->{$_} } $pindex->words;
    my @new;
    if ( $PERL_5_8 ) {
        @new = keys %{$rwcount};
    }
    else { #storing as bytes, convert back to unicode
        @new = map { pack "U0C*", unpack "C*", $_ } keys %{$rwcount};
    }
    $pindex->set_words( \@new );
    $pindex->save or return;

    return ( $rwcount, \@delete );
}

# TEXT/OBJECT PROCESSING SUPPORT -----------------------------------

sub richtext_to_text {
    my $text = shift;
    require BigMed::Filter;
    return html_to_text( BigMed::Filter->filter($text) );
}

sub html_to_text {
    my $text = shift;
    if ( !$text ) {
        $text = q{} if !defined $text;
        return $text;
    }

    $text =~ s/<!\-\-.*?\-\->//msg;    #kill comments

    #mild attempt to remove contents of bad containing tags
    $text =~ s/<\s*
      (script|style|select|head|textarea)
      [^>]*>
      .+?
      <\/\s*\1[^>]*>//xmsgi;
    $text =~ s/<[^>]+>//msg;                        #remaining tags
    $text =~ s{&(\#?[xX]?(?:[0-9a-fA-F]+|\w+));}{
        local $_ = $1;
        /^amp$/i      ? '&'
          : /^quot$/i ? '"'
          : /^gt$/i   ? '>'
          : /^lt$/i   ? '<'
          : /^#(\d+)$/ ? chr($1)
          : /^#x([0-9a-f]+)$/i ? chr( hex($1) )
          : $_
    }msgex;
    return $text;
}

sub author_to_text {
    my ( $page, $pcache ) = @_;
    my $author_text = $page->authors($pcache->{'BigMed::Pointer'}) or return q{};
    my @texts = ($author_text);
    foreach my $rpair ( $page->load_related_objects( 'author', $pcache ) ) {
        my %meta = $rpair->[0]->metadata;
        my $text = BigMed::Filter->filter( $meta{blurb} );
        push @texts, $text if $text;
    }
    return html_to_text( join( q{ }, @texts ) );
}

sub tags_to_text {
    my ( $page, $pcache ) = @_;
    my @tags =
      map { $_->[1]->name } ( $page->load_related_objects( 'tag', $pcache ) );
    return html_to_text( join( q{ }, @tags ) );
}

sub media_to_text {    #add media tally and caption
    my ( $page, $pcache ) = @_;
    my @texts;
    require BigMed::Filter;
    foreach my $rpair ( $page->load_related_objects( 'media', $pcache ) ) {
        my $text = $rpair->[1]->title || q{};
        push @texts, $text if $text;
        my %meta = $rpair->[0]->metadata;
        $text = BigMed::Filter->filter( $meta{caption} );
        push @texts, $text if $text;
    }
    return html_to_text( join( q{ }, @texts ) );
}

sub comment_to_text {
    my ( $page, $pcache ) = @_;
    require BigMed::Comment;
    my $comments =
      BigMed::Comment->select(
        { site => $page->site, page => $page->id, status => 'ok' } )
      or return;

    require BigMed::Filter;
    my ( $com, @texts );
    while ( $com = $comments->next ) {
        push @texts, $com->commenter if $com->commenter;
        push @texts, $com->safe_content;
    }
    return if !defined $com;
    return html_to_text( join( q{ }, @texts ) );
}

sub download_to_text {
    return _doc_to_text( 'download', @_ );
}

sub podcast_to_text {
    return _doc_to_text( 'podcast', @_ );
}

sub link_url_to_text {
    my ( $page, $pcache ) = @_;
    my @texts;
    foreach my $ext_url ( $page->load_related_objects( 'link_url', $pcache ) )
    {
        ( my $url = $ext_url->url || q{} ) =~ s/^https?//msi;
        push @texts, ( $ext_url->text || q{} ), $url;
    }
    return html_to_text( join( q{ }, @texts ) );
}

sub _doc_to_text {
    my ( $relation, $page, $pcache ) = @_;
    my @texts;
    foreach my $rpair ( $page->load_related_objects( $relation, $pcache ) ) {
        push @texts, ( $rpair->[1]->title || q{} ),
          ( $rpair->[1]->filename || q{} );
    }
    return html_to_text( join( q{ }, @texts ) );
}

1;
__END__

=head1 NAME

BigMed::Search::Indexer - Add/remove pages to the Big Medium search index

=head1 DESCRIPTION

BigMed::Search::Indexer handles all updates to the Big Medium search
index for the default search engine and is used internally by
BigMed::Search. BigMed::Search::Indexer can be subclassed if you're
creating a search plugin for a different search engine.

=head1 SUMMARY

    use BigMed::Search::Indexer;

    #constructor
    my $indexer = BigMed::Search::Indexer->new('en');

    #add or reindex pages
    $indexer->index_page($page_obj)
      or BigMed::Error->error_stop;    #i/o error
    $indexer->index_page($selection_of_pages)
      or BigMed::Error->error_stop;    #i/o error

    #remove pages
    $indexer->remove_page($page_obj)
      or BigMed::Error->error_stop;    #i/o error
    $indexer->remove_page($selection_of_pages)
      or BigMed::Error->error_stop;    #i/o error

=head1 CONSTRUCTOR

=head2 C<new>

    $indexer = BigMed::Search::Indexer->new($iso_locale);
    $indexer = BigMed::Search::Indexer->new() #locale default is 'en-us'

Returns a BigMed::Search::Indexer object. You can specify the ISO
locale in the first argument to tell the parser what language you're
indexing (this affects how the indexer handles word-stemming for supported
languages and what words, if any, are omitted from the index as stopwords).

=head1 INDEXING METHODS

=head2 C<index_page>

    $indexer->index_page($page_obj);
    $indexer->index_page($selection_of_pages);

Accepts either a page object or selection of page objects as the argument
and adds or reindexes the content of the page(s) to the search index.
Returns a true value on success or false on error (also adding an error
to the BigMed::Error queue).

Content is indexed with word stemming and stopwords as described via
BigMed::Search::WordParser's C<get_words> method.

Each word is scored as described in the C<count_page_words> method and stored
along with all other pages' scores for the word in a monolithic keyword
index file in the moxiedata/search directory. Each page's list of words is
additionally stored in a BigMed::Search::PageIndex record for fast lookup
by C<remove_page> and also to compare changes when reindexing the page later.

=head2 remove_page

    $indexer->remove_page($page);
    $indexer->remove_page($selection_of_pages);
    $indexer->remove_page( { site => $site, pages => \@page_ids } );

Accepts one of three types of arguments:

=over 4

=item * A page object

=item * A selection of page objects

=item * A hash argument with two key/value pairs:

=over 4

=item * site => site id or site object

=item * pages => array reference of one or more page ids

=back

=back

The method removes the page(s) indicated by the argument from the search
index. Returns a true value on success or false on error (also adding an
error to the BigMed::Error queue).

=head1 INDEX SUPPORT METHODS

=head2 C<count_page_words>

    my $rwords = $indexer->count_page_words($page_obj[, $pointer_cache]);

Returns a hash reference with all words in the page as keys and each word's
respective score as the corresponding values. The word list is generated
by BigMed::Search::WordParser and reflects the word-stemming and stopwords
available for the current locale.

The first argument is the object to index; the optional second argument is
a BigMed::Pointer selection to use to search for related objects, which
provides a performance benefit if provided.

Each word is counted according to its context (i.e., according to the
field or related object where it's found). In addition to this contextual
weighting of a word's raw score, the final score for the word is divided by
the total number of words in the page, so that longer documents don't
skew search results. This score is multipled by 10,000 and stored as an
integer, for a score range of 0 to 10,000.

By default, the method indexes the pages in the following fields and related
objects with the listed weight:

=over 4

=item * C<title> (weight: 1)

=item * C<description> (weight: 0.8)

=item * C<meta_description> (weight: 0.8)

=item * C<meta_keywords> (weight: 0.8)

=item * Tags (weight: 0.8)

=item * C<content> (weight: 0.6)

=item * Authors (weight: 0.6)

=item * Media, documents, audio/visual - Title and caption (weight: 0.4)

=item * Comments (weight: 0.2)

=back

Page subtypes also get their related objects indexed:

=over 4

=item * Link pages: Link text and URL (weight: 1)

=item * Document download pages: Document title and filename (weight: 1)

=item * Podcast pages: Pdocast title and filename (weight: 1)

=back

You can add or suppress fields and related objects via the C<register_field>
and C<register_relation> methods.

=head2 HTML Processors

These methods are used internally to convert html and rich text into
plain-text strings for indexing.

=head3 C<html_to_text>

    my $text = BigMed::Search::Indexer::html_to_text( $html );

Accepts a string of html and returns a new string stripped of HTML tags
and with basic and numeric HTML entities converted to their plain-text
equivalents.

=head3 C<richtext_to_text>

    my $text = BigMed::Search::Indexer::html_to_text( $richtext );

Accepts a rich-text string (e.g. the value of a page's C<content> or
C<description> field) and returns a plain-text equivalent.

=head2 Related Object Processors

These methods are used internally to gather a page's related objects
and return a plain-text string for indexing. The page object is the
first argument in all of these methods, and the optional second
argument is a selection of BigMed::Pointer objects to use to search
for related objects, offering a performance benefit.

=head3 C<author_to_text>

    my $text = BigMed::Search::Indexer::author_to_text( $page, $pointers );

Returns a plain-text string containing the text of the page's author names
and author blurbs.

=head3 C<tags_to_text>

    my $text = BigMed::Search::Indexer::tags_to_text( $page, $pointers );

Returns a plain-text string of all of the page's tags.

=head3 C<media_to_text>

    my $text = BigMed::Search::Indexer::media_to_text( $page, $pointers );

Returns a plain-text string of all titles and captions for the page's
media objects (images, documents and audio/video files).

=head3 C<comment_to_text>

    my $text = BigMed::Search::Indexer::comment_to_text( $page, $pointers );

Returns a plain-text string containing all of the page's visitor comments
and names of the comment contributors.

=head3 C<download_to_text>

    my $text = BigMed::Search::Indexer::download_to_text( $page, $pointers );

For document-download pages, returns a plain-text string containing the
title and filename of the page's download file.

=head3 C<podcast_to_text>

    my $text = BigMed::Search::Indexer::podcast_to_text( $page, $pointers );

For podcast pages, returns a plain-text string containing the
title and filename of the page's podcast.

=head3 C<link_url_to_text>

    my $text = BigMed::Search::Indexer::podcast_to_text( $page, $pointers );

For link pages, returns a plain-text string containing the link
text and url (with http/https removed).

=head1 FIELD AND RELATION REGISTRATION

These methods allow you to finetune the content that is indexed for each
page.

=head2 C<register_field>

    BigMed::Search::Indexer->register_field(
        {   name   => 'slug',
            weight => 0.8,
            parse => sub { return $_->[0] },
            subtype => 'article',
        },
        {   name   => 'meta_keywords',
            weight => 0,                 #stop indexing this field
        }
    );

Adds, removes or adjusts the BigMed::Content::Page columns to be indexed.
Accepts one or more hash references containing some or all of the following
fields:

=over 4

=item * C<name>: Required. The name of the column.

=item * C<parse>: Code reference to a routine to process the value of the
column. The routine receives the value of the column as its only argument
and should return a plain-text string containing no html. If adding a new
column, this parameter is required; if modifying or removing a column, it's
optional.

=item * C<weight>: The value from 0 to 1 to weight each word's score in
this field. If 0, the field will not be indexed at all.
If modifying an existing column, this field is optional; if not provided,
the weight will remain unchanged.

=item * C<subtype>: Optional. Specifies the subtype(s) for which this column
should be indexed; other columns will not have this column indexed. If this
parameter is not provided, all subtypes will index the column. The value
can be a string for a single subtype or an array reference of multiple
subtypes.

=back

=head3 C<register_relation>

    BigMed::Search::Indexer->register_relation(
        {   name   => 'related_url',
            weight => 0.2,
            parse => \&parsing_routine,
            subtype => 'article',
        },
        {   name   => 'comment',
            weight => 0,                 #stop indexing this field
        }
    );

Adds, removes or adjusts the relationship objects to be indexed.
Similar to C<register_field>, the method accepts one or more hash
references containing some or all of the following fields:

=over 4

=item * C<name>: Required. The name of the relationship to index (e.g.
author, related_url, comment, media, etc.).

=item * C<parse>: Code reference to a routine to fetch the relationship
text. The routine receives the page object in the first argument and,
optionally, a selection of BigMed::Pointer objects to use to search
for related objects. The routine should return a single plain-text string
containing all of the text to index for all of the related objects.

If adding a new relationship for indexing, this parameter is required;
if modifying or removing a relationship, it's optional.

=item * C<weight>: The value from 0 to 1 to weight each word's score in
this relationship. If 0, the relationship will not be indexed at all.
If modifying an existing column, this field is optional; if not provided,
the weight will remain unchanged.

=item * C<subtype>: Optional. Specifies the subtype(s) for which this
relationship should be indexed; other columns will not have this relationship
indexed. If this parameter is not provided, all subtypes will index the
relationship. The value can be a string for a single subtype or an array
reference of multiple subtypes.

=back

=head1 AUTHOR & COPYRIGHTS

This module and all Big Medium modules are copyright Josh Clark
and Global Moxie. All rights reserved.

Use of this module and the Big Medium content management system
are governed by Global Moxie's software licenses and may not be
used outside of the terms and conditions outlined there.

For more information, visit the Global Moxie website at
L<http://globalmoxie.com/>.

Big Medium and Global Moxie are service marks of Global Moxie
and Josh Clark. All rights reserved.

=cut

