# Copyright 2002-2008 Josh Clark and Global Moxie, LLC. This code cannot be
# redistributed without permission from globalmoxie.com.  For more
# information, consult your Big Medium license.
#
# $Id: NoBots.pm 3283 2008-08-26 08:17:21Z josh $

package BigMed::NoBots;
use strict;
use warnings;
use utf8;
use Carp;
$Carp::Verbose = 1;

use base 'Exporter';
our @EXPORT_OK = (
    'antispam_prefs',           'antispam_referrer_match',
    'set_captcha_getter',       'set_captcha_checker',
    'get_captcha_html',         'clear_captcha_html',
    'confirm_captcha_response', 'antispam_display_info',
    'query_antispam_fields',    'is_form_fresh',
    'register_antispam_prefs',
);

use BigMed;
use BigMed::Prefs;
use BigMed::MD5 qw(md5_hex);
use Time::Local;
use HTML::Template;
use BigMed::DiskUtil qw(bm_file_path bm_write_file bm_delete_file);
use English qw( -no_match_vars );

my $MAXIMUM_MINUTES = 18 * 60;   #discard if timestamp is older than 18 hours
my $MINIMUM_SECONDS = 10;        #discard if response younger than this
my $TIMEOUT_MINUTES = 30;        #reprompt if more than this but less than max

my %ANTISPAM_PREFS = (
    'antispam_caption' => {
        default     => 'This question helps prevent spam:',
        edit_type   => 'rich_text_inline',
        edit_params =>
          { description => 'PREFS_ANTISPAM_DESC_caption' },
        priority => 95,
        sitewide => 1,
    },
    'antispam_challenge' => {
        default     => 'What color is the sky on a clear day?',
        edit_type   => 'simple_text',
        edit_params =>
          { description => 'PREFS_ANTISPAM_DESC_challenge', required => 1 },
        priority => 90,
        sitewide => 1,
    },
    'antispam_response' => {
        default     => 'blue',
        edit_type   => 'simple_text',
        edit_params =>
          { description => 'PREFS_ANTISPAM_DESC_response', required => 1 },
        priority => 85,
        sitewide => 1,
    },
);
my $GET_CAPTCHA_HTML = \&_confirm_challenge_include;
my $CONFIRM_CAPTCHA  = \&_confirm_challenge_response;
my $CLEAR_CAPTCHA    = \&_remove_challenge_include;

sub antispam_prefs {
    return %ANTISPAM_PREFS;
}

sub register_antispam_prefs {
    return if BigMed::Prefs->pref_exists('antispam_challenge');
    foreach my $pref_name ( keys %ANTISPAM_PREFS ) {
        BigMed::Prefs->register_pref( $pref_name,
            $ANTISPAM_PREFS{$pref_name} );
    }
    return;
}

sub antispam_referrer_match {
    my ( $self, $page, $suffix ) = @_;
    $suffix ||= q{};
    my $referrer = $ENV{HTTP_REFERER} or return 1;
    my $subtype = $page->subtype || q{};
    my $slug = $subtype eq 'section' ? 'index' : ( $page->slug || 'index' );
    my $file = "$slug$suffix.shtml";
    return $referrer =~ /\Q$file\E(\?|\z)/ms;
}

sub set_captcha_getter {
    $GET_CAPTCHA_HTML = $_[1];
    return;
}

sub set_captcha_checker {
    $CONFIRM_CAPTCHA = $_[1];
    return;
}

sub set_captcha_clearer {
    $CLEAR_CAPTCHA = $_[1];
    return;
}

sub get_captcha_html {
    return $GET_CAPTCHA_HTML->(@_);
}

sub confirm_captcha_response {
    return $CONFIRM_CAPTCHA->(@_);
}

sub clear_captcha_html {
    return $CLEAR_CAPTCHA->(@_);
}

sub _antispam_secret {
    my ($page) = @_;
    return BigMed->bigmed->env('BMSALT')
      . ( $page->slug || 'index' )
      . $page->create_time;
}

sub _confirm_challenge_include {
    my ( $self, $site, $page ) = @_;    #page not used, but available via api
    my $dot = BigMed->bigmed->env('DOT');
    my $file = $site->html_dir . "/bm${dot}challenge.txt";
    if ( !-e $file ) {
        register_antispam_prefs();
        my $mdata      = BigMed->bigmed->env('MOXIEDATA');
        my @tmpl_paths = (
            bm_file_path( $mdata, qw(templates_custom site_templates HTML) ),
            bm_file_path( $mdata, qw(templates site_templates HTML) ),
        );
        my $tmpl = HTML::Template->new(
            filename          => 'wi_challenge.tmpl',
            path              => \@tmpl_paths,
            die_on_bad_params => 0,
            cache             => 1,
            loop_context_vars => 1
        );
        my $doctype = $site->get_pref_value('html_htmlhead_doctype');
        my $caption = $site->get_pref_value('antispam_caption');
        $tmpl->param(
            challenge => $site->get_pref_value('antispam_challenge'),
            caption => $caption,
            close     => ( $doctype =~ /xhtml/msi ? q{ /} : q{} ),
        );
        my $html = $tmpl->output;
        bm_write_file( $file, $html, { build_path => 1 } ) or return;
    }

    #return location relative to web root
    my $pagedir = $site->html_url;
    $pagedir =~ s{^https?://[^/]*}{}msi;
    return qq{<!--#include virtual="$pagedir/bm${dot}challenge.txt" -->};
}

sub _remove_challenge_include {
    my ( $self, $site, $page ) = @_;    #page not used, but available via api
    my $dot = BigMed->bigmed->env('DOT');
    my $file = $site->html_dir . "/bm${dot}challenge.txt";
    return bm_delete_file($file);
}

sub _confirm_challenge_response {
    my ( $self, $site, $page ) = @_;    #page not used, but available via api

    #expects self to be a bigmed::app::web object
    my %field = $self->parse_submission(
        {   id       => 'BMF_CHALLENGE',
            parse_as => 'simple_text',
        }
    );
    register_antispam_prefs();
    my $expected = lc( $site->get_pref_value('antispam_response') );
    my $response = $field{BMF_CHALLENGE};

    if ( !defined $response || lc $response ne $expected ) {
        my $label = $site->get_pref_value('antispam_challenge');
        return $self->set_error(
            head => ['NOBOTS_Incorrect response',      $label],
            text => ['NOBOTS_TEXT_Incorrect response', $label],
        );
    }
    return 1;
}

sub antispam_display_info {
    my ( $self, $page, $rfields, $rparam ) = @_;
    $rparam ||= {};
    my $secret = _antispam_secret($page);

    my %info;
    $info{set_time} = q{<!--#config timefmt="%S%j%m%M%Y%H%d" -->};
    my ( $local_time, $fsuffix );
    if ( $rparam->{no_include} ) {
        $info{local_time} = _server_timestamp();
        $fsuffix = $info{local_time} if index( $OSNAME, 'MSWin' ) < 0;
    }
    elsif ( index( $OSNAME, 'MSWin' ) < 0 ) {    #not win32
        $info{set_time}
          .= '<!--#set var="BMDATE_LOCAL" value="$DATE_LOCAL" -->';
        $fsuffix = $info{local_time} = '<!--#echo var="BMDATE_LOCAL"-->';
    }
    else {    #win32; can't handle ssi variables, drop timestamp suffix
        $info{local_time} = '<!--#echo var="DATE_LOCAL" -->';
        $fsuffix = q{};
    }

    $info{tstamp}   = 'f' . md5_hex("tstamp$secret");
    $info{realname} =
      { map { $_ => 'f' . md5_hex("$_$secret") . $fsuffix } @{$rfields} };
    $info{fakename} =
      { map { $_ => 'f' . md5_hex("foo$_$secret") . $fsuffix } @{$rfields} };

    return %info;
}

sub _server_timestamp {
    my @time = localtime;
    $time[4]++;    #month
    my $year = $time[5] + 1900;
    my $ydays = sprintf( '%03d', $time[7] );
    my ( $sec, $min, $hours, $mday, $mon ) =
      map { sprintf( '%02d', $_ ) } @time[0 .. 4];
    return "$sec$ydays$mon$min$year$hours$mday";
}

sub query_antispam_fields {
    my ( $self, $page, $rparam ) = @_;
    $rparam ||= {};
    my $secret = _antispam_secret($page);

    #check timestamp for range
    my $min = $rparam->{min_seconds};
    $min = $MINIMUM_SECONDS if !defined $min;
    my $max = $rparam->{max_minutes};
    $max = $MAXIMUM_MINUTES if !defined $max;
    my ( $tstamp, $epoch ) = _query_timestamp_and_epoch( $self, $page );
    my $age_secs = time - $epoch;
    return if !$tstamp || $age_secs < $min || ( $age_secs / 60 ) > $max;

    #check the honeypots and gather field names
    my $suffix = index( $OSNAME, 'MSWin' ) < 0 ? $tstamp : q{};
    my %fieldname;
    my @fields = $rparam->{fields} ? @{ $rparam->{fields} } : ();
    foreach my $field (@fields) {
        my $fieldname = 'f' . md5_hex("$field$secret") . $suffix;
        return
          if $self->utf8_param( 'f' . md5_hex("foo$field$secret") . $suffix )
          || !defined $self->utf8_param($fieldname);
        $fieldname{$field} = $fieldname;
    }
    return \%fieldname;
}

sub _query_timestamp_and_epoch {
    my ( $self, $page ) = @_;
    my $secret = _antispam_secret($page);
    my $tstamp = $self->utf8_param( 'f' . md5_hex("tstamp$secret") ) || q{};
    $tstamp =~ s/\D//msg;
    return if !$tstamp || length $tstamp != 17;    #bad field, it's a bot
    my ( $sec, $ydays, $mon, $min, $year, $hours, $mday ) = (
        substr( $tstamp, 0,  2 ),
        substr( $tstamp, 2,  3 ),
        substr( $tstamp, 5,  2 ),
        substr( $tstamp, 7,  2 ),
        substr( $tstamp, 9,  4 ),
        substr( $tstamp, 13, 2 ),
        substr( $tstamp, 15, 2 ),
    );
    $mon--;
    $year = $year - 1900;
    my $epoch = timelocal( $sec, $min, $hours, $mday, $mon, $year );
    return ( $tstamp, $epoch );
}

sub is_form_fresh {
    my ( $self, $page, $limit_minutes ) = @_;
    my ( $tstamp, $epoch ) = _query_timestamp_and_epoch( $self, $page );
    return if !$tstamp;
    $limit_minutes ||= $TIMEOUT_MINUTES;
    my $minute_age = ( time - $epoch ) / 60;
    return ( $minute_age <= $limit_minutes );
}

1;

__END__

=head1 Name

=head2 BigMed::NoBots

Provides several methods to detect and deter spambots submitting
to public forms.

=head1 Synopsis

    #to generate a bot-repellant form
    package MyApp;
    use BigMed::NoBots qw(get_captcha_html antispam_display_info);

    #get alternate fieldnames, as well as fake names for honeypot fields
    my %display_info =
      MyApp->antispam_display_info( $page_obj, [qw(email comment submit)] );

    #fieldnames
    my %realname = %{ $display_info{realname} };
    my %fakename = %{ $display_info{fakename} };
    
    #timestamp info
    my $timestamp_field = $display_info{tstamp};
    my $time_setter = $display_info{set_time};
    my $local_time = $display_info{local_time};

    #get a captcha challenge field
    $captcha_html = MyApp->get_captcha_html($site_obj, $page_obj);

    #build the fields with this info    
    my $field_html = qq{
        <!-- set timestamp -->
        $time_setter
        <input name="$timestamp_field" value="$local_time" type="hidden" />
    
        <!-- real fields -->
        <p><label>
            E-mail
            <input name="$realname{email}" type="text" />
        </label></p>
        
        <p><label>
            Comment
            <textarea name="$realname{comment}" rows="4" cols="20"></textarea>
        </label></p>
        
        <!-- captcha challenge -->
        $captcha_html
        
        <p>
            <input name="$realname{submit}" type="submit" />
        </p>
        
        <!--fake fields; if value submitted, it's a bot -->
         <div style="display:none">
            Leave these fields blank
            <input name="$fakename{email}" type="text" />
            <textarea name="$fakename{comment}" rows="1" cols="1"></textarea>
        </div>
    };

    #to parse a bot-repellant form
    package MyApp;
    use base qw(BigMed::App::Web);
    use BigMed::NoBots qw(antispam_referrer_match confirm_captcha_response
                          query_antispam_fields   is_form_fresh);

    #make sure the form comes from the expected page
    if ( $myapp->antispam_referrer_match($page_obj) ) {
        print "Not submitted from correct page.";
        exit;
    }

    #get the alternate field names for this form (and along the way
    #make sure that the timestamp is valid and that non honeypot
    #fields were supplied).

    my $rfield =
      $myapp->query_antispam_fields( $page_obj,
        { fields => [qw(email comment)] } );
    if ( !$rfield ) {
        print "You're a bot!";
        exit;
    }
    my $email_fieldname   = $rfield->{email};
    my $comment_fieldname = $rfield->{comment};

    #check the captcha response
    if ( $myapp->confirm_captcha_response( $site_obj, $page_obj ) ) {
        print "Did not receive the correct captcha response.";
        exit;
    }

    #is the form older than 30 minutes?
    if ( !$app->is_form_fresh($page) ) {    #form is older than 30 minutes
        print "Your form expired, please submit again.";
        exit;
    }

    #read the fields
    my $value = $myapp->parse_submission(
        {   id       => $email_fieldname,
            parse_as => 'email',
        },
        {   id       => $comment_fieldname,
            parse_as => 'raw_text',
        },

    );
    my $email   = $value{$email_fieldname};
    my $comment = $value{$comment_fieldname};

=head1 Description

This module exports several methods to make spambot form submissions easier
to identify and discard. There are four elements to this approach:

=over 4

=item 1. Obfuscated field names

Field names/ids are hashed with a secret and concatenated with an obfuscated
timestamp, so that the field names change with every page request. If the
correct field names are not provided, it's almost certainly a spambot.

=item 2. Timestamp

An obfuscated timestamp is included as a hidden field in the form. If the
timestamp is too old, it's almost certainly a spambot.

=item 3. Honeypots

Fake field names are provided and are intended to be used for text input
fields that are hidden from human users (via C<style="display:none">, for
example). If a value is submitted in these fields, it's almost certainly
a spambot.

=item 4. Page referrer

The referrer is checked and, if provided, is used to make sure that the
form has been submitted from the expected page.

=item 5. CAPTCHA

A CAPTCHA challenge is provided. The default CAPTCHA is a simple, configurable
challenge question, but it's pluggable and can be overridden with something
more complex.

=back

=head1 Field Name Methods

=head2 C<< $self->antispam_display_info($page_obj, \@fields, \%param) >>

    my %display_info =
      MyApp->antispam_display_info( $page_obj, [qw(email comment submit)] );

    my %realname = %{ $display_info{realname} };
    my %fakename = %{ $display_info{fakename} };
    my $timestamp_field = $display_info{tstamp};
    my $time_setter = $display_info{set_time};
    my $local_time = $display_info{local_time};

    my $field_html = qq{
        <!-- set timestamp -->
        $time_setter
        <input name="$timestamp_field" value="$local_time" type="hidden" />
    
        <!-- real fields -->
        <p><label>
            E-mail
            <input name="$realname{email}" type="text" />
        </label></p>
        
        <p><label>
            Comment
            <textarea name="$realname{comment}" rows="4" cols="20"></textarea>
        </label></p>
        <p>
            <input name="$realname{submit}" type="submit" />
        </p>
        
        <!--fake fields; if value submitted, it's a bot -->
         <div style="display:none">
            Leave these fields blank
            <input name="$fakename{email}" type="text" />
            <textarea name="$fakename{comment}" rows="1" cols="1"></textarea>
        </div>
    };

Returns a hash with these key/value pairs:

=over 4

=item * set_time => $ssi_tag

HTML to include at the beginning of the form to set the SSI variable
used in timestamp value and form names.

=item * tstamp => $timestamp_fieldname

The name to use for the hidden timestamp field.

=item * local_time => $timestamp_value

The value to use for the hidden timestamp field.

=item * realname => \%fieldnames

A reference to hash whose keys are the original field names and whose
values are the obfuscated names to use.

=item * fakename => \%fakenames

A reference to hash whose keys are the origial field names and whose
values are the names to use for the honeypot fields which, if provided,
are an indicator that the form has been submitted by a spambot.

=back

The field accepts three arguments: 1) the page object on which the form
appears, 2) a reference to an array of field names to obfuscate,
3) an optional hashref of parameters with the following option:

=over 4

=item * no_include => 1

By default, the routine returns timestamp and field values intended to be
used in a SSI (.shtml) page. If you are displaying these fields as part
of a dynamic cgi, the C<no_include> flag indicates that the page will
not be processed for SSI and will instead return appropriate values for
a CGI-displayed form.

=back

=head2 C<<$self->query_antispam_fields($page_obj, \%param)>>

Checks the fields to make sure that all of the corresponding obfuscated
fieldnames were submitted and that no values were provided to any of the
honeypot fields. In addition, checks to make sure that the timestamp field
is present and that the value is within the specified age range (defaults
to range of 10 seconds to 18 hours).

If these conditions are satisfied, the routine returns a hash reference
whose keys are the original field names and whose values are the actual,
obfuscated names. If the conditions are not satisfied, the routine returns
undef, and the submission should be discarded.

    my $rfield =
      $self->query_antispam_fields( $page_obj,
        { fields => [qw(email comment)] } );
    if ( !$rfield ) {
        print "You're a bot!";
        exit;
    }
    my $email_fieldname   = $rfield->{email};
    my $comment_fieldname = $rfield->{comment};

The method accepts two arguments: 1) the page object on which the form
appeared, and 2) A hash reference of parameters:

=over 4

=item * fields => \@field_names

The original, unobfuscated field names.

If any of the corresponding obfuscated field names were not submitted,
the method will decide that this is a bot submission, so go carefully here:
If your form contains multiple submit buttons, you should not include
those submit fields here; same with checkbox fields. Including those
elements can give you a false positive.

=item * min_seconds => $seconds

Default is 10. If the timestamp of the submitted form is younger than
this (or in the future), the method flags it as a spambot.

=item * max_minutes => $minutes

Default is 18 * 60 (18 hours). If the timestamp of the submitted form is
older than this, the method flags it as a spambot.

=back

=head1 CAPTCHA Methods

=head2 C<< $self->get_captcha_html( $site_obj, $page_obj ) >>

Returns HTML for the CAPTCHA field, to be included in the form.

The default is a simple challenge and response question (the question
is stored in the site object's antispam_challenge preference and the
answer is in the antispam_response preference). This HTML is a SSI
include tag intended to be displayed in a SSI (.shtml) page; if you're
including it in a dynamic CGI page, you should run the page through the
BigMed::PageUtils C<replace_all_includes> method.

This default response be overrided by providing a callback to the
C<set_captcha_getter> method.

=head2 C<< $self->confirm_captcha_response( $site_obj, $page_obj ) >>

Checks to make sure that a correct response has been provided to the
field provided by the C<get_captcha_html> method. The default method
requres that the $self object be a BigMed::App::Web object.

The default behavior of this method may be overridden by providing
a callback to the C<set_captcha_checker> method.

=head2 C<< $self->clear_captcha_html( $site_obj, $page_obj ) >>

Removes captcha_html from the site. Under the default settings, this
deletes the existing include file containing the captcha HTML.

This default response may be overridden by providing a callback to the
C<set_captcha_clearer> method.

=head2 C<< $self->set_captcha_getter( \&callback ) >>

Overrides the default C<get_captcha_html> method, using the callback
argument as the new routine. The routine receives the calling object/class,
site object and page object as arguments and should return a HTML
string to include in the form.

=head2 C<< $self->set_captcha_checker( \&callback ) >>

Overrides the default C<confirm_captcha_response> method, using the callback
argument as the new routine. The routine receives the calling object/class,
site object and page object as arguments and should return true if the
CAPTCHA has been correctly answered, otherwise false.

=head2 C<< $self->set_captcha_clearer( \&callback ) >>

Overrides the default C<clear_captcha_html> method, using the callback
argument as the new routine. The routine receives the calling object/class,
site object and page object as arguments and should return true on success,
otherwise false.

=head1 Other Checks

=head2 C<< $self->antispam_referrer_match($page_obj, $suffix) >>

Returns true if the referrer indicates that the form was submitted from
the page object (or, more specifically, that it was submitted from a
.shtml page ending in the page object's slug).

The optional second argument indicates that the expected referring
page has a slug suffix. For example, to ensure that a field is submitted
from a page's email page:

    my $dot = BigMed->bigmed->env('DOT');
    if (!$self->antispam_referrer_match($page, "${dot}email")) {
        print "Bad referrer!";
        exit;
    }

Because not all browsers provide referrer information, an absence of 
a referrer url cannot be assumed to be a spambot. In that case, the
test is skipped and returns true.

=head2 C<< $self->is_form_fresh($page_obj, $max_minutes) >>

Returns true if the age of the form's timestamp value is younger than
the number of minutes specified in the second argument (default is 30
minutes).

This is a separate time check than the one described in the
C<query_antispam_fields> method. That check is intended to winnow out
very old forms. This one is for relatively younger forms that still
might have the possibility of being real submissions. It's intended
to be used to prompt users to review their submission and submit
again to make sure that they are indeed human.

=head1 Antispam preferences

The module handles the Big Medium installation's CAPTCHA anti-spam
preferences and allows external code to request this preference
definition so that they can register the preferences when needed.
(BigMed::Format::HTML uses this to register the preferences as
HTML preferences, for example.)

=head2 C<< $self->antispam_prefs >>

Returns a hash of preference definitions, which may be registered like so:

    use BigMed::Prefs;
    my %pref = $self->antispam_prefs;
    foreach my $name ( keys %pref ) {
        BigMed::Prefs->register_pref( $name, $pref{$name} );
    }

=head2 C<< register_antispam_prefs() >>

Registers the antispam preferences.

=head1 Author & Copyrights

This module and all Big Medium modules are copyright Josh Clark
and Global Moxie. All rights reserved.

Use of this module and the Big Medium content
management system are governed by Global Moxie's software licenses
and may not be used outside of the terms and conditions outlined
there.

For more information, visit the Global Moxie website at
L<http://globalmoxie.com/>.

Big Medium and Global Moxie are service marks of Global Moxie
and Josh Clark. All rights reserved.

