# Copyright 2002-2008 Josh Clark and Global Moxie, LLC. This code cannot be 
# redistributed without permission from globalmoxie.com.  For more
# information, consult your Big Medium license.
#
# $Id: WYSIWYG.pm 3114 2008-06-16 16:29:12Z josh $

package BigMed::Filter::WYSIWYG;
use strict;
use utf8;
use Carp;
use BigMed::Filter;

my $lb_temp = time;

#removed font -- too much junk in there.
my $ALLOW_INLINE = '
    a           |
    abbr        |
    acronym     |
    big         |
    cite        |
    code        |
    del         |
    em          |
    ins         |
    kbd         |
    object      |
    param       |
    embed       |
    img         |
    q           |
    small       |
    span        |
    strike      |
    strong      |
    sub         |
    sup         |
    tt          |
    u';

my $ALLOW_BLOCK = '
    address     |
    area        |
    blockquote  |
    center      |
    div         |
    h\d         |
    li          |
    ol          |
    pre         |
    table       |
    tbody       |
    td          |
    tfoot       |
    th          |
    thead       |
    tr          |
    ul';

#p gets special treatment so it goes into misc.

my $ALLOW_MISC = '
    br          |
    caption     |
    col         |
    colgroup    |
    hr          |
    iframe      |
    img         |
    link        |
    p';

my $ALLOW_ALL = "$ALLOW_INLINE | $ALLOW_BLOCK | $ALLOW_MISC";

#don't include <br>:
my $SELFCLOSER = 'img | hr | link | col | colgroup | param';

#strip out tags surrouding no content except for these:
my $EMPTY_OK = 'a\ name | embed | param | object | iframe';


#strip these out in _degunk_word_styles
my $BAD_STYLES =
  '((MSO-[^:]+)|(FONT[^:]+)|PAGE-BREAK-BEFORE|TAB-STOPS|COLOR)';
my $BAD_ATTR = '(FACE|LANG)';


BigMed::Filter->register_filter(
    label  => 'RichText',
    filter => \&sanitize_html,
    prompt => { 'BigMed::App::Web::Prompt' => \&fckeditor_prompt },
    pre_prompt => \&browser_check,
    browser_check => sub{ browser_compatible( $_[0]->query->user_agent() ) },
);

sub fckeditor_prompt {
    my ( $app, $id, $roptions ) = @_;
    return undef if !browser_compatible( $app->query->user_agent() );
    my %options = $roptions ? %$roptions : ();
    my $toolbar = $options{toolbar} || 'bm-full';
    my $link =
      $app->env('BMADMINURL')
      . '/wysiwyg/editor/fckeditor.html'
      . "?InstanceName=$id&amp;Toolbar=$toolbar";
    
    my $value =
      defined $options{value}
      ? $app->escape( sanitize_html( $options{value} ) )
      : q{};

    my %config = ref $options{config} eq 'HASH' ? %{ $options{config} } : ();
    my @key_value =
      map { $app->escape($_) . '=' . $app->escape( $config{$_} ) }
      keys %config;

    $app->html_template(
        'wi_filter_richtext_prompt.tmpl',
        FIELD_NAME    => $id,
        FIELD_VALUE => $value,
        FRAME_LINK   => $link,
        CONFIG_STRING   => join('&amp;', @key_value),
        FRAME_WIDTH => $options{width} || '100%',
        FRAME_HEIGHT => $options{height} || '400',
    );
}


sub browser_check {
    my ( $app, $text ) = @_;
    $text = '' if !defined $text;
    if ( browser_compatible( $app->query->user_agent() ) ) {
        return ('RichText', $text);
    }
#    elsif ( BigMed::Filter->filter_exists('Markdown') ) {
#        return ('Markdown', html_to_markdown($text));
#    }
    else {
        $text = BigMed::Filter->filter("RichText:$text");
        $text = BigMed::Filter->inline_single_graf($text);
        my $filter = (!$text || $text !~ /</) ? 'Markdown' : 'RawHTML';
        return ($filter, $text);
    }
}

sub browser_compatible {
    my $agent = shift or return 0;
    my $version;

    if (   ( $agent =~ /MSIE/i )
        && !( $agent =~ /mac/i )
        && !( $agent =~ /Opera/i ) )
    {
        $version = substr( $agent, index( $agent, 'MSIE' ) + 5, 3 );
        return ( $version >= 5.5 );
    }
    elsif ( $agent =~ /Gecko\//i ) {
        $version = substr( $agent, index( $agent, 'Gecko/' ) + 6, 8 );
        return ( $version >= 20030210 );
    }
    elsif ( $agent =~ /Opera\//i ) {
        $version = substr( $agent, index( $agent, 'Opera/' ) + 6, 4 );
        return ( $version >= 9.5 );
    }
    elsif ( $agent =~ /AppleWebKit\/(\d+)/i ) {
        return ( $1 >= 522 );
    }
    else {
        return 0;    # 2.0 PR fix
    }
}

sub html_to_markdown { #TO DO!
    my $text = shift;
    $text;
}



sub sanitize_html {
    my($html) = @_;
    my $close = ' /'; #need to fix this

    #make all tags and attributes lowercase (for xhtml friendliness)
    #leaves attributes as-is
    $html =~ s/<\s*([^\s>]+)/<\L$1/g;

    #strip out line breaks; we'll work with line breaks as if they're <br>
    #(we'll turn <br> into line breaks later)
    $html =~ s/\s*\n+\s*/ /g;

    #transform i, b and strike --> em and strong
    $html =~ s/<(\/?)i>/<$1em>/g;
    $html =~ s/<(\/?)b>/<$1strong>/g;
    $html =~ s/<(\/?)strike>/<$1del>/g;

    $html = _scrub_bad_tags($html);
    $html = _degunk_word_styles($html);

    $html = _clean_inline_tags_across_lbs($html);
    
    #protect paragraph attributes
    $html =~ s{<p\s+([^>]+)>}{<BMFOO $1><p>}msg;

    #work with plain line breaks instead of <br> and <p>
    $html =~ s/ *<br[^>]*> */\n/ig;
    $html =~ s/ *<p( [^>]*)?> */\n\n/ig;
    $html =~ s/ *<\/p> *//ig;
        
    # CONVERT LINE BREAKS -> <BR> AND <P>
    # =========================================================
    
    #add <br> and <p> formatting, adding back protected p attributes
    $html =~ s/<BMFOO ([^>]+)>\n\n/<\/p>$lb_temp<p $1>/g;
    $html =~ s/\n\n/<\/p>$lb_temp<p>/g;
    $html =~ s/\n/<br$close>$lb_temp/g;

    if ($html !~ /^<($ALLOW_BLOCK)/ox) {
        $html = "<p>$html";
    }
    $html .= "</p>";
    
    # CLOSE UNCLOSED <P> TAGS
    # =========================================================

    1 while ($html =~ s/ <p>
            (
            [^<]*            # all safe content
            (?:    (?! <\/p>)    # can't match closing tag

            <[^<]*    )*        # match remaining safe stuff
            )
                            # <p> must be closed before it hits
                            # one of these open or close tags
            <(\/?)($ALLOW_BLOCK)([^>]*)>                

            /<p>$1<\/p>$lb_temp<$2$3$4>/ogx );

    # ELIMINATE UNOPENED </P> TAGS
    # =========================================================
    #depending on how midas stacks multiple divs, we could wind up
    #with extra </p> tags -- eliminate multiples
    $html =~ s/(<\/p>){2,}/<\/p>/g;
    
    #find any </p> that follows a block tag
    #without an intervening <p>
    
    $html =~ s/ <(\/?($ALLOW_BLOCK))( [^>]*)?>
            (
            [^<]*        # all safe content
                        # can't match open or close p or block
            (?:    (?!<\/?(p|$ALLOW_BLOCK)(?:[\s>]))

            <[^<]*    )*    # match remaining safe stuff
            )
            <\/p>        #dangling closed p

            /<$1$3>$4/ogx;

    # HANDLE EMPTY TAGS/ATTRIBUTES
    # =========================================================

    #insert space into empty <p> tags
    $html =~ s/<p><\/p>/<p> <\/p>/g;
    
    #make non-breaking space use the numeric for gentle xml behavior
    $html =~ s/&nbsp;/&#160;/g;
    
    #eliminate empty tags except those in EMPTY_OK
    1 while ($html =~ s/($lb_temp)*
        < (?! (?:$EMPTY_OK (?![a-zA-Z]) ) )
        (\S+) [^>]* >           #any tag other than empty_ok
        ((\s|$lb_temp)*)        #empty or whitespace
        <\/\2>($lb_temp)*       #close tag
        /$3/msoxg);

    $html =~ s/$lb_temp/\n/g;
    
    #remove trailing empty <p> tags
    $html =~ s{(\s*<p[^>]*>\s*(&nbsp;|&#160;)*\s*</p>)+\z}{}ims;

    return $html;

} # end prep_for_text routine

sub _degunk_word_styles {
    my $html = shift;
    $html =~ s{<o:p>\s*<\/o:p>}{}msg;
    $html =~ s{<o:p>.+?<\/o:p>}{&#160;}msg;
    $html =~ s{\s*$BAD_ATTR\s*=\s*"[^"]*"}{}msgi;
    $html =~ s{\s*face=[^ >]*}{}msgi;
    $html =~ s{class\s*=\s*"mso[^"]+"}{}msgi;
    
    $html =~ s{style\s*=\s*"([^"]*)"}{
        my $v = $1;
        $v =~ s{\s*$BAD_STYLES\s*:[^;]+;?}{}msgi;
        $v ? qq{style="$v"} : q{};
    }msgie;
    
    #word treats line breaks as paragraphs
    $html =~ s{<p\s*>\s*<span\s*>\s*(&#160;\s*)?</span>\s*</p>\s*}{}msgi;
    
    #get rid of empty spans (except with intervening "loaded" spans, can
    #live with that)
    1 while ($html =~ s/ <span\s*>
            (
            [^<]*                   # all safe content
            (?:    (?! <\/?span)    # can't match opening or closing tag

            <[^<]*    )*            # match remaining safe stuff
            )
            <\/span>                # closing tag

            /$1/ogx );
    
    return $html;
}

sub _scrub_bad_tags {
    my $html = shift;
    
    #get rid of any tags that are not loaded via the rich text editor and
    #so which must have been imported via other apps
    
    #replace any comment tags and contents
    $html =~ s/<!\-\-.*?\-\->//gs;

    #mild attempt to remove contents of bad containing tags
    $html =~ s/<\s*(script|style|select|head|textarea)[^>]*>.+?<\/\s*\1[^>]*>//gs;

    #remove all tags except those explicitly used by big medium
    $html =~ s/
        <                 #start tag
        (?! \/?           #include close tags too
            (             #anything other than the allowed tags
            $ALLOW_ALL
            )
            (?:\s|>|\/)   #followed by a space, slash or >
        )
        [^>]*>            #the bad tag to eliminate
    //ogx;    

    return $html;
}

sub _clean_inline_tags_across_lbs {
    my $html = shift;
    
    #only working with inline tags that are not self-closers
    my $OPENTAG = "(?!$SELFCLOSER) (?:$ALLOW_INLINE)";
    
    #move any inline closing tags that appear immediate after a <br>
    #and move them inside.        
    $html =~ s/\s*((<br[^>]*>\s*)+)((<\/($ALLOW_INLINE)>\s*)+)/$3$1/ogx;
    
    #likewise, move any open tags that appear immediate before a
    #<br> to the location immediate after (otherwise interferes
    #with properly generating paragraph tags)
    $html =~ s/\s*((<($OPENTAG(\ [^>]*)?)>\s*)+)((<br[^>]*>\s*)+)/$5$1/ogx;
    
    #close and reopen any inline tags that cross a <br>
    #boundary (could eventually mean the beginning/end of
    #a <p> tag). Have to be careful here to close the
    #tags in the right order.
    #still to sort out: eliminating anchor tags across lines        
    {
        no warnings; #can give noisy 'undefined value' warnings
        1 while ($html =~ s/
            < ($OPENTAG) (?![a-zA-z]) ([^>]*) >
            (
            [^<]*                               # all safe content ($3)
            (?:
                (?!
                    <\/\1>    |                 # can't match closing tag
                    <(?: br | p | $ALLOW_BLOCK ) (?![a-zA-Z]) [^>]* >
                )
            <[^<]*    )*?            # match remaining safe stuff
            )
                                # must be closed before it hits
                                # any close tags (not its own) $4
            (
                (<\/
                (?! \1>)
                [^>]*> )*?
            )
                                #...which are immediately
                                # followed by <br> or block $6
            ((<\/? (?: br | p | $ALLOW_BLOCK)  (?![a-zA-Z]) [^>]*>\s*)+)        
            
            ( (< (?!\/?\1>)        # and any open\/close tags not its
                [^>]*>)* )        # own $8
    
            /<$1$2>$3<\/$1>$4$6$8<$1$2>/ogx);
    }
    
    #if any of these tags were unclosed, then there's a straggler left
    #at the end of the string

    $html =~ s/(\s* < $OPENTAG (?![a-zA-Z]) [^>]* > \s* )+\z//ox;
    
    return $html;
}



1;

__END__
