# Copyright 2002-2008 Josh Clark and Global Moxie, LLC. This code cannot be
# redistributed without permission from globalmoxie.com.  For more
# information, consult your Big Medium license.
#
# $Id: Retriever.pm 3043 2008-03-31 14:00:38Z josh $

package BigMed::Search::Retriever;
use strict;
use warnings;
use utf8;
use Carp;
$Carp::Verbose = 1;

use Search::Dict;
use Fcntl qw(:DEFAULT :flock);
use BigMed::Search::WordParser;
use BigMed::Search::PageIndex;
use BigMed::Search::Result;
use BigMed::DiskUtil qw(bm_file_path bm_untaint_filepath);
use BigMed::Error;
my $ERR = 'BigMed::Error';

my $PERL_5_8 = ( $] >= 5.008 ); #determines utf-8 handling

# CONSTRUCTOR ------------------------------------------------------

sub new {
    my ( $class, $locale ) = @_;
    $locale = lc( $locale || 'en-us' );
    $locale = 'en-us' if $locale eq 'en';
    $locale =~ s/_/-/msg;

    my $self = bless {
        locale => $locale,
        parser => BigMed::Search::WordParser->new($locale)
    }, $class;
    return $self;
}

sub search {
    my ( $self, $site, $qstring, $rparam ) = @_;
    $rparam ||= {};
    my $sid;
    if ( ref $site ) {
        croak 'first search argument must be a site object or id'
          if ref $site ne 'BigMed::Site';
        $sid = $site->id;
    }
    else {
        $sid = $site;
    }
    croak 'no site id provided to search' if !$sid || $sid =~ /\D/;

    #collect the sorted matches and return the result object
    my $rpage_ids = $self->_gather_matches( $sid, $qstring ) or return;
    return BigMed::Search::Result->new(
        site    => $site,
        start   => $rparam->{start},
        results => $rparam->{results},
        pages   => $rpage_ids,
    );

}

sub parse_qstring {
    my ( $self, $qstring ) = @_;
    $qstring = q{} if !defined $qstring;
    $qstring =~ s/\A\s+//;
    $qstring =~ s/\s+\z//;
    if ( !$qstring ) {
        return $ERR->set_error(
            head => 'SEARCH_No search terms',
            text => 'SEARCH_TEXT_No search terms',
        );
    }
    my $rwords = $self->{parser}->get_words($qstring);
    if ( !@{$rwords} ) {
        return $ERR->set_error(
            head => 'SEARCH_No results',
            text => 'SEARCH_TEXT_No query',
        );
    }
    return $rwords;
}

sub _gather_matches {
    my ( $self, $sid, $qstring ) = @_;
    my $rwords = $self->parse_qstring($qstring) or return;
    
    if (!$PERL_5_8) { #matching against raw bytes
        foreach my $w ( @{$rwords} ) {
            $w = pack "C*", unpack "U0C*", $w;
        }
    }
    
    my $page_count;
    {    #gather overall page count for inverse doc frequency calculation
        my $all_page = BigMed::Search::PageIndex->select( { site => $sid } )
          or return;
        $page_count = $all_page->count;
    }

    my $mdata = BigMed->bigmed->env('MOXIEDATA')
      or croak 'no moxiedata directory defined';
    my $main_index =
      bm_untaint_filepath( bm_file_path( $mdata, 'search', "site$sid.cgi" ) )
      or return;
    return [] if !-e $main_index;

    my $DATAINDEX;
    sysopen( $DATAINDEX, $main_index, O_RDONLY )
      or return $ERR->set_io_error( $DATAINDEX, "open", $main_index, $! );
    flock( $DATAINDEX, LOCK_SH )
      or return $ERR->set_io_error( $DATAINDEX, "lock_sh", $main_index, $! );
    binmode( $DATAINDEX, ":utf8" ) if $PERL_5_8;

    my ( %tscore, %wmatch );
    foreach my $w ( @{$rwords} ) {
        next if ( look $DATAINDEX, "$w:" ) < 0;
        my $line = <$DATAINDEX>;
        if (!$PERL_5_8) { #manually convert the line to utf8
            $line = pack "U0C*", unpack "C*", $line;
        }

        next if index( $line, "$w:" ) != 0;
        chomp $line;
        my ( $kw, $docs, $raw_count ) = split /:/, $line;
        my %pages = map { split /-/ } ( split /!/, $raw_count );

        my $idf = log( $page_count / $docs ) / 10000;
        while ( my ( $pid, $score ) = each %pages ) {
            $wmatch{$pid} ||= 0;
            $tscore{$pid} ||= 0;
            $wmatch{$pid}++;
            $tscore{$pid} += ( $idf * $score );
        }
    }

    close($DATAINDEX);

    #sort the results by word match and then by score
    return [
        sort { $wmatch{$b} <=> $wmatch{$a} || $tscore{$b} <=> $tscore{$a} }
          keys %wmatch
    ];
}

1;
__END__

=head1 NAME

BigMed::Search::Retriever - Search pages in the Big Medium search index

=head1 SUMMARY

    #constructor
    my $retr = BigMed::Search::Retriever->new('en-us');
    
    #get BigMed::Search::Result object
    $result = $retr->search( $site_obj_or_id, 'search string' );

    #display results using result methods
    my $n = $result->num_results;
    if ($n) {
        my $start = $result->start_position;
        my $end   = $start + $n;
        print "$start to $end of ", $result->total_results,
          " total results.\n";
    }
    foreach my $rsummary ( $result->result_set ) {
        print $rsummary->{total}, ': ', $rsummary->{url}, "\n";
    }

=head1 DESCRIPTION

BigMed::Search::Retriever fetches search results for the default search
engine and is used internally by BigMed::Search. BigMed::Search::Retriever
can be subclassed if you're creating a search plugin for a different
search engine.

=head1 METHODS

=head2 C<new>

    $retr = BigMed::Search::Retriever->new($iso_locale);
    $retr = BigMed::Search::Retriever->new() #locale default is 'en-us'

Returns a BigMed::Search::Retriever object. You can specify the ISO
locale in the first argument to tell the parser what language you're
indexing (this affects how the retriever handles word-stemming for supported
languages and what words, if any, are omitted from the search).

=head2 C<search>

    $result = $retr->search( $site_obj_or_id, $query_string );
    my $n = $result->num_results;
    if ($n) {
        my $start = $result->start_position;
        my $end   = $start + $n;
        print "$start to $end of ", $result->total_results,
          " total results.\n";
    }
    foreach my $rsummary ( $result->result_set ) {
        print $rsummary->{total}, ': ', $rsummary->{url}, "\n";
    }

Returns a BigMed::Search::Result object with search results for the
site indicated in the first argument (id or object) with the query
string in the second argument.

Returns a false value on error, adding a message to the BigMed::Error
queue.

While the first argument can be either a site object or id, you will
get a modest performance increase by providing the object if you already
have one loaded. This saves BigMed::Search::Result from loading
the site and its section objects.

Although the search is an OR search, the results are sorted to give
AND matches the top result. Specifically, results that match the most
search terms come first and, within those results, the words with the
highest word scores. Word scores are weighted by the TF-IDF method
(term frequency / inverse document frequency).

=head2 C<parse_qstring>

    my $rwords = $retr->parse_qstring('string to search');

Used internally by C<search>, this method returns a reference to an
array of words to search. Supported language locales return the words
stemmed and stripped of any stopwords.

If no search words are found, returns undef and adds an error to the
BigMed::Error queue.

=head1 AUTHOR & COPYRIGHTS

This module and all Big Medium modules are copyright Josh Clark
and Global Moxie. All rights reserved.

Use of this module and the Big Medium content management system
are governed by Global Moxie's software licenses and may not be
used outside of the terms and conditions outlined there.

For more information, visit the Global Moxie website at
L<http://globalmoxie.com/>.

Big Medium and Global Moxie are service marks of Global Moxie
and Josh Clark. All rights reserved.

=cut

