# Copyright 2002-2008 Josh Clark and Global Moxie, LLC. This code cannot be 
# redistributed without permission from globalmoxie.com.  For more
# information, consult your Big Medium license.
#
# $Id: Filter.pm 3043 2008-03-31 14:00:38Z josh $

package BigMed::Filter;
use strict;
use warnings;
use utf8;
use Carp;
$Carp::Verbose = 1;
use BigMed::Plugin;
use BigMed::Trigger;

my %filter;
my $filters_loaded;

sub register_filter {
    my $self    = shift;
    my %options = @_;
    my $label   = $options{label}
      or croak 'no label; register_filter requires labels for all filters';
    croak "Filter labels cannot contain spaces ('$label')" if $label =~ /\s/;

    #see if there's been a filter already registered if none supplied
    $options{filter} ||= $filter{$label} ? $filter{$label}->{filter} : undef;
    if ( ref $options{filter} ne 'CODE' ) {
        croak "no coderef supplied for filter '$label'";
    }

    #use pre-existing pre-prompt if none supplied
    $options{pre_prompt} ||= $filter{$label}->{pre_prompt} if $filter{$label};
    $options{pre_prompt} = undef if ref $options{pre_prompt} ne 'CODE';

    #supplement any existing prompt coderefs with the supplied coderefs
    my %prompt = $filter{$label} ? %{ $filter{$label}->{prompt} } : ();
    if ( ref $options{prompt} eq 'HASH' ) {
        my %new_prompt = %{ $options{prompt} };
        foreach my $class ( keys %new_prompt ) {
            next if ref $new_prompt{$class} ne 'CODE';
            $prompt{$class} = $new_prompt{$class};
        }
    }

    $filter{$label} = {
        filter        => $options{filter},
        prompt        => \%prompt,
        pre_prompt    => $options{pre_prompt},
        browser_check => $options{browser_check},
    };
    return 1;
}

sub remove_filter {
    my $self = shift;
    delete @filter{@_};
}

###########################################################
# REGISTER THE DEFAULT BASELINE FILTER: RAW HTML
###########################################################

BigMed::Filter->register_filter(
    label  => 'RawHTML',
    filter => sub { $_[0] },
    pre_prompt => sub { ('RawHTML', $_[0]->escape( $_[1] ) ) },
);

###########################################################
# ACCESSORS
###########################################################

sub _load_filters {
    return 1 if $filters_loaded;
    BigMed::Plugin->load_content_filters();
    $filters_loaded = 1;
}

sub browser_supports {
    my ( $class, $app, $filter ) = @_;
    my $rbrowser_check = $filter{$filter}->{browser_check};
    return $rbrowser_check ? $rbrowser_check->($app) : 1;
}

sub browser_filter {
    my ( $class, $app, $filter ) = @_;
    if ( $class->filter_exists($filter) && $class->browser_supports($app,$filter) ) {
        return $filter;
    }
    else {
        return 'RawHTML';
    }
}

sub all_filters {
    _load_filters();
    my @filters =
      sort { lc $a cmp lc $b }
      grep { $_ ne 'RichText' && $_ ne 'RawHTML' }
      keys %filter;

    #rich text is always first and rawhtml is always last
    unshift @filters, 'RichText'
      if BigMed::Filter->filter_exists('RichText');
    push @filters, 'RawHTML'
      if BigMed::Filter->filter_exists('RawHTML');
    @filters;
}

sub filter_exists {
    my $label = $_[1] or return undef;
    _load_filters();
    exists $filter{$label};
}

sub filter {
    my $class     = shift;
    my $raw_value = shift;
    my ( $filter, $text ) = $class->extract_filter_and_text($raw_value);
    return '' if !$filter || !$class->filter_exists($filter);
    my $filtered = $filter{$filter}->{filter}->($text);
    $filtered =~ s/\A\s+//;
    $filtered =~ s/\s+\z//;
    BigMed::Filter->call_trigger('post_filter', \$filtered, $class);
    $filtered;
}

sub prompt_coderef {
    my $class        = shift;
    my $label        = shift;
    my $prompt_class = shift;
    return undef
      if !$label || !$prompt_class || !$class->filter_exists($label);
    return $filter{$label}->{prompt}->{$prompt_class};
}

###########################################################
# FILTER JUGGLING
###########################################################

sub preprompt_text_and_filter {
    my ( $class, $app, $raw_value ) = @_;
    my ( $current_filter, $text ) =
      $class->extract_filter_and_text($raw_value);
    my $use_filter = '';
    while ( $use_filter ne $current_filter ) {
        $current_filter = $use_filter if $use_filter;
        ( $use_filter, $text ) =
            $filter{$current_filter}->{pre_prompt}
          ? $filter{$current_filter}->{pre_prompt}->( $app, $text )
          : ( $current_filter, $text );
        $use_filter = $class->browser_filter($app, $use_filter);
    }
    return ( $text, $current_filter );
}

sub extract_filter_and_text {
    my $class = shift;
    my $raw_value = shift || '';
    _load_filters();
    my ( $filter, $text );
    if ( $raw_value =~ /\A([a-zA-Z0-9]+):(.*)\z/xms ) {
        $text = $2 || '';
        $filter = $class->filter_exists($1) ? $1 : 'RawHTML';
    }
    else {
        $text = $raw_value;
        $filter = $class->filter_exists('RichText') ? 'RichText' : 'RawHTML';
    }
    return ( $filter, $text );
}

###########################################################
# HANDY HELPERS
###########################################################

my $blocks = qr/ p | div | h[1-6] | blockquote | pre | table | dl | ol | ul
  | script | noscript | form | fieldset | iframe | math | ins | del/x;


# inline_single_graf()
# strips a single set of <p> tags off of simple solo paragraphs:
# '<p>foo</p>' becomes just 'foo'
# but '<p>foo</p><p>bar</p>' is untouched; respects other blocks too

sub inline_single_graf {
    my ($class, $text ) = @_;
    return q{} if !defined $text;
    
    #clear leading empty grafs
    $text =~ s{\A\s*<p[^>]*>\s*(&nbsp;|&#160;)*\s*</p>}{}imsg;
    
    #strip out <p>...</p> from single-graf tags
    $text =~ s{ \A \s*              #start of string, forgiving of lead space
                <p[^>]*>            #p leads string
                (
                [^<]*               # all safe content
                (?: (?!</?$blocks>) # can't match open or close tag
    
                <[^<]* )*           # match remaining safe stuff
                )
                </p>                # the closing tag
                \s*                 # space or line break
                \z                  #end of string
                }{$1}imsxo;
    return $text;
}

###########################################################
# SANITIZE PUBLIC SUBMISSIONS
###########################################################

my @ALLOWED_ELEMENTS = (
    'a',    'abbr',       'acronym', 'address',
    'b',    'blockquote', 'br',      'cite',
    'code', 'dd',         'del',     'dfn',
    'dl',   'dt',         'em',      'h1',
    'h2',   'h3',         'h4',      'h5',
    'h6',   'hr',         'i',       'img',
    'ins',  'kbd',        'li',      'ol',
    'p',    'pre',        'q',       's',
    'samp', 'span',       'strike',  'strong',
    'sub',  'sup',        'tt',      'u',
    'ul',   'var'
);
my %TRANSLATE_ELEMENTS = (
    'h1' => 'strong',
    'h2' => 'strong',
    'h3' => 'strong',
    'h4' => 'strong',
    'h5' => 'strong',
    'h6' => 'strong',
);
my $ALLOWED_ELEM_REGEX = join( '|', @ALLOWED_ELEMENTS );


my %ALLOWED_ATTRIBUTES = (
    'align'          => 1,
    'alt'            => 1,
    'border'         => 1,
    'charset'        => 1,
    'cite'           => 1,
    'clear'          => 1,
    'dir'            => 1,
    'height'         => 1,
    'href'           => 1,
    'hreflang'       => 1,
    'hspace'         => 1,
    'longdesc'       => 1,
    'noshade'        => 1,
    'span'           => 1,
    'src'            => 1,
    'start'          => 1,
    'style'          => 1,
    'target'         => 1,
    'title'          => 1,
    'type'           => 1,
    'valign'         => 1,
    'vspace'         => 1,
    'width'          => 1,
);

my %ATTRIBUTE_URL_VALUE = ( 'href' => 1, 'src' => 1, 'cite' => 1 );

my %ALLOWED_CSS_PROPERTIES = (
    'direction'           => 1,
    'display'             => 1,
    'font-style'          => 1,
    'font-weight'         => 1,
    'height'              => 1,
    'overflow'            => 1,
    'text-align'          => 1,
    'text-decoration'     => 1,
    'text-indent'         => 1,
    'width'               => 1
);

my %ALLOWED_PROTOCOLS = (
    'ed2k'   => 1,
    'ftp'    => 1,
    'http'   => 1,
    'https'  => 1,
    'irc'    => 1,
    'mailto' => 1,
    'news'   => 1,
    'gopher' => 1,
    'nntp'   => 1,
    'telnet' => 1,
    'webcal' => 1,
    'xmpp'   => 1,
    'callto' => 1,
    'feed'   => 1,
    'urn'    => 1,
    'aim'    => 1,
    'rsync'  => 1,
    'tag'    => 1,
    'ssh'    => 1,
    'sftp'   => 1,
    'rtsp'   => 1,
    'afs'    => 1
);

sub allowed_public_elements { return @ALLOWED_ELEMENTS }
sub translated_public_elements { return %TRANSLATE_ELEMENTS }
sub allowed_public_attributes { return sort keys %ALLOWED_ATTRIBUTES }
sub allowed_public_css { return sort keys %ALLOWED_CSS_PROPERTIES }
sub allowed_public_protocols { return sort keys %ALLOWED_PROTOCOLS }

sub set_allowed_public_elements {
    @ALLOWED_ELEMENTS = map { lc $_ } @_;
    $ALLOWED_ELEM_REGEX = join( '|', @ALLOWED_ELEMENTS );
}
sub set_translated_public_elements {
    %TRANSLATE_ELEMENTS = @_;
}
sub set_allowed_public_attributes {
    %ALLOWED_ATTRIBUTES = map { ( lc $_ ) => 1} @_;
}
sub set_allowed_public_css {
    %ALLOWED_CSS_PROPERTIES = map { ( lc $_ ) => 1} @_;
}
sub set_allowed_public_protocols {
    %ALLOWED_PROTOCOLS = map { ( lc $_ ) => 1} @_;
}

sub sanitize {
    my ( $class, $text ) = @_;
    return q{} if !defined $text;

    #lop off any elements that include bad characters
    1 while (
        $text =~ s{
            <\s*
            (/? \s* [a-zA-Z0-9]+)(?![\s>])[^a-zA-Z0-9>][^\s>]*
        ([^>]*)(>|\z)
        }{<$1$2>}msgx
    );
    
    #remove bad tags
    1 while (
        $text =~ s{
        <\s*
        (?!                                 #anything other than...
          /? \s* ($ALLOWED_ELEM_REGEX) \b   #allowed opening or closing tags
        )
        ([^\s>]+)                           #bad tag to sanitize
        ( [^>]* (?:>|\z) )                  #attributes and end
        }{&lt;$2$3}msoigx
    );
    
    #remove bad attributes, and lowercase good tags and attributes
    $text =~ s{
        <\s*                                        # start tag
            ( /? ) \s* ($ALLOWED_ELEM_REGEX )      # $1: element
            \b\s*
            ([^>]*)                                 # $2: attributes
            (\z|>)                                  # end
    }{
        my ($end, $tag, $attr) = ($1, lc $2,$3);
        my @attr;
        
        $tag = $TRANSLATE_ELEMENTS{$tag} if $TRANSLATE_ELEMENTS{$tag};
        
        #any attribute not in quotes gets discarded
        while ($attr =~ m{(\S+?)\s*=\s*(["'])(.*?)\2}msg) {
            my $name = lc $1;
            next if !$ALLOWED_ATTRIBUTES{$name};
            my $v = $3;
            
            #check url protocol
            if ( $ATTRIBUTE_URL_VALUE{$name} ) {    #unescape
                ( my $plain = $v ) =~ s{&(\#?[xX]?(?:[0-9a-fA-F]+|\w+));}{
                    local $_ = $1;
                    /^amp$/i      ? '&'
                      : /^quot$/i ? '"'
                      : /^gt$/i   ? '>'
                      : /^lt$/i   ? '<'
                      : /^#(\d+)$/ ? chr($1)
                      : /^#x([0-9a-f]+)$/i ? chr( hex($1) )
                      : $_
                }gex;
                $plain =~ s/[\000-\040\177-\240]+//msg;
                $plain = lc $plain;
                next
                  if $plain =~ /^[a-z0-9][-+.a-z0-9]*:/
                  && !$ALLOWED_PROTOCOLS{ ( split( /:/, $plain ) )[0] };
                push @attr, 'rel="nofollow"';
            }
            
            #handle styles
            if ( $name eq 'style' ) {
                $v = sanitize_css($v);
            }
            push @attr, qq{$name="$v"};
        }
        $end ||= q{};
        my $close = substr($attr, -1 ,1) eq '/' ? ' />' : '>';
        $attr = @attr? join( q{ }, q{}, @attr) : join( q{ }, @attr) ;
        "<$end$tag$attr$close";
    }msiogxe;

    return $text;
}

sub sanitize_css {
    my $v = shift;
    $v = lc $v;
    $v =~ s/url\s*\(\s*[^\s)]+?\s*\)\s*/ /msg;
    
    return q{}
      if $v !~ /^(
            [:,;#%.\sa-zA-Z0-9!]
          | \w-\w
          | \'[\s\w]+\'
          | \"[\s\w]+\"
          | \([\d,\s]+\)
        )*$/msx;
    return q{}
      if $v !~ /^(\s* [-\w]+ \s* : \s* [^:;]* (;|$) )*$/msx;
    
    my @clean;
    while ( $v =~ /([-\w]+)\s*:\s*([^:;]*)/msg ) {
        push @clean, "$1: $2;" if $ALLOWED_CSS_PROPERTIES{$1};
    }
    return join(q{ }, @clean);
}


1;

__END__

=head1 NAME

BigMed::Filter - Base class for registering and dispatching Big Medium
text filters.

=head1 DESCRIPTION

The Big Medium content management system allows editors to enter content in
several different formats (WYSIWYG, Markdown, Moxie Code, raw HTML, etc).
Every format is associated with a "filter" which provides one or more routines
to prompt for text for that format in specific Big Medium applications, and
a single routine to process the received text into (X)HTML. BigMed::Filter
allows you to register, access, update and override these filters and
associated routines.

=head1 SYNOPSIS

    use BigMed::Filter;

    #register a filter
    BigMed::Filter->register_filter(
        label  => 'MyFilter',
        filter => \&my_filter,
        prompt => { 'BigMed::App::Web::Prompt' => \&web_prompt },
    );

    #update a filter by adding a prompt routine for another app
    BigMed::Filter->register_filter(
        label  => 'MyFilter',
        prompt => { 'BigMed::App::Email::Prompt' => \&web_prompt },
    );

    #update a filter by replacing its filter routine
    BigMed::Filter->register_filter(
        label  => 'MyFilter',
        filter => \&different_filter,
    );

    #retrieve the list of registered filters
    my @filter_labels = BigMed::Filter->all_filters();

    #run rich-text-formatted raw value through the filter
    my $filtered_text = BigMed::Filter->filter($raw_value);

    #get a prompt routine for an app
    my $web_coderef =
      BigMed::Filter->prompt_coderef( 'MyFilter', 'BigMed::App::Web::Prompt' );
    my $email_coderef =
      BigMed::Filter->prompt_coderef( 'MyFilter', 'BigMed::App::Email::Prompt' );
      
    #prepare the filter and text for display in prompt
    my ($text, $filter) =
      BigMed::Filter->preprompt_text_and_filter($app, $raw_value);

    #extract filter and text from raw value
    my ($filter, $text) = BigMed::Filter->extract_filter_and_text($raw_value);

    #remove a filter
    BigMed::Filter->remove_filter('MyFilter');

=head1 METHODS

=head2 C<< BigMed::Filter->register_filter(%arguments) >>

Registers a text filter based on the argument hash:

=over 4

=item * label => $filter_name

The filter's name (required).

=item * filter => \&filter

The coderef for the routine that processes the text into (X)HTML.

=item * prompt_coderef => \%prompt_routines

A hashref where the key is the class name of an application's prompt
handler (e.g. BigMed::App::Web::Prompt) and the value is the coderef
for the routine that generates the prompt.

=item * pre_prompt => \&pre_prompt

Optional code reference to run before prompting text with the filter.
The routine receives the application object and text as arguments and
must return the filter and text. Essentially:

    ($filter, $text) = &pre_prompt($app, $text);

This is a useful place to do text transformations or specify a different
filter (the RichText filter, for example, switches to Markdown for
unsupported browsers).

=item * browser_check => \&browser_check

Optional code reference to run to determine whether the current browser
supports this filter. The code should return a true value if the browser
is supported, false if not. The routine receives the application object
as the only argument.

=over 4

The method may be called multiple times for the same label, and each
time the existing registration will be updated to reflect the change.
For example, if you first register a filter like so:

    BigMed::Filter->register_filter(
        label => 'MyFilter',
        filter => \&my_filter,
        prompt => { 'BigMed::App::Web::Prompt' => \&web_prompt },
    );

...and subsequently call it again, adding a prompt for a different
prompting class:

    BigMed::Filter->register_filter(
        label => 'MyFilter',
        prompt => { 'BigMed::App::Email::Prompt' => \&email_prompt },
    );

Then the new Email prompt will be added to the existing Web prompt
in the registration. The original \&my_filter coderef will remain
the same.

=head2 C<< BigMed::Filter->remove_filter('label1', 'label2', ...) >>

Unregisters the filters whose labels are supplied as arguments.

=head2 C<< BigMed::Filter->all_filters() >>

Returns an array of all filter names. Filters are sorted alphabetically,
except for the "rich text" and "raw html" filters which appear first
and last respectively.

=head2 C<< BigMed::Filter->filter($raw_value) >>

Accepts a rich-text-formatted raw value (for example,
C<filter_name:unfiltered_text>) and returns the filtered text ready for
display as html.

=head2 C<< BigMed::Filter->prompt($filter_name, $prompt_class) >>

Returns the prompt code reference for the filter name and prompt class
supplied in the arguments. Undef one or both is unregistered.

=head2 C<< BigMed::Filter->filter_exists($filter_name) >>

Returns true if the specified filter has been registered.

=head2 C<< BigMed::Filter->browser_supports($app, $filter_name) >>

Returns true if the current browser supports the filter named in the
second argument.

=head2 C<< ($text, $filter) =
  BigMed::Filter->preprompt_text_and_filter($app, $raw_value) >>

Returns the text and filter to use for a user prompt when editing
content. The routine checks the filter's pre-prompt routine to make
sure that all is well (and change filters, for example, if the
current app or browser does not support the selected filter).

=head2 C<< ($filter, $text) =
  BigMed::Filter->extract_filter_and_text($raw_value) >>

Returns the filter and text from the raw rich-text-formatted value
in the argument.

=head2 C<< $use_filter = BigMed::Filter->browser_filter($app, $filter) >>

Checks that the filter in the second argument is supported by the browser
and if it exists. If so, the same filter is returned. If not, the 'RawHTML'
filter is returned.

=head2 C<< $text = BigMed::Filter->inline_single_graf($text)

Takes a html string (should already be filtered) and returns a string
stripped of first and last paragraph tags provided that it is just
a single paragraph. If the text includes more than a single paragraph
or any other block elements, it is returned untouched.

=head1 SEE ALSO

=over 4

=item * BigMed::Plugin

=back

=head1 AUTHOR & COPYRIGHTS

This module and all Big Medium modules are copyright Josh Clark
and Global Moxie. All rights reserved.

Use of this module and the Big Medium content
management system are governed by Global Moxie's software licenses
and may not be used outside of the terms and conditions outlined
there.

For more information, visit the Global Moxie website at
L<http://globalmoxie.com/>.

Big Medium and Global Moxie are service marks of Global Moxie
and Josh Clark. All rights reserved.

=cut

