# Copyright 2002-2008 Josh Clark and Global Moxie, LLC. This code cannot be # redistributed without permission from globalmoxie.com. For more # information, consult your Big Medium license. # # $Id: WYSIWYG.pm 3114 2008-06-16 16:29:12Z josh $ package BigMed::Filter::WYSIWYG; use strict; use utf8; use Carp; use BigMed::Filter; my $lb_temp = time; #removed font -- too much junk in there. my $ALLOW_INLINE = ' a | abbr | acronym | big | cite | code | del | em | ins | kbd | object | param | embed | img | q | small | span | strike | strong | sub | sup | tt | u'; my $ALLOW_BLOCK = ' address | area | blockquote | center | div | h\d | li | ol | pre | table | tbody | td | tfoot | th | thead | tr | ul'; #p gets special treatment so it goes into misc. my $ALLOW_MISC = ' br | caption | col | colgroup | hr | iframe | img | link | p'; my $ALLOW_ALL = "$ALLOW_INLINE | $ALLOW_BLOCK | $ALLOW_MISC"; #don't include
: my $SELFCLOSER = 'img | hr | link | col | colgroup | param'; #strip out tags surrouding no content except for these: my $EMPTY_OK = 'a\ name | embed | param | object | iframe'; #strip these out in _degunk_word_styles my $BAD_STYLES = '((MSO-[^:]+)|(FONT[^:]+)|PAGE-BREAK-BEFORE|TAB-STOPS|COLOR)'; my $BAD_ATTR = '(FACE|LANG)'; BigMed::Filter->register_filter( label => 'RichText', filter => \&sanitize_html, prompt => { 'BigMed::App::Web::Prompt' => \&fckeditor_prompt }, pre_prompt => \&browser_check, browser_check => sub{ browser_compatible( $_[0]->query->user_agent() ) }, ); sub fckeditor_prompt { my ( $app, $id, $roptions ) = @_; return undef if !browser_compatible( $app->query->user_agent() ); my %options = $roptions ? %$roptions : (); my $toolbar = $options{toolbar} || 'bm-full'; my $link = $app->env('BMADMINURL') . '/wysiwyg/editor/fckeditor.html' . "?InstanceName=$id&Toolbar=$toolbar"; my $value = defined $options{value} ? $app->escape( sanitize_html( $options{value} ) ) : q{}; my %config = ref $options{config} eq 'HASH' ? %{ $options{config} } : (); my @key_value = map { $app->escape($_) . '=' . $app->escape( $config{$_} ) } keys %config; $app->html_template( 'wi_filter_richtext_prompt.tmpl', FIELD_NAME => $id, FIELD_VALUE => $value, FRAME_LINK => $link, CONFIG_STRING => join('&', @key_value), FRAME_WIDTH => $options{width} || '100%', FRAME_HEIGHT => $options{height} || '400', ); } sub browser_check { my ( $app, $text ) = @_; $text = '' if !defined $text; if ( browser_compatible( $app->query->user_agent() ) ) { return ('RichText', $text); } # elsif ( BigMed::Filter->filter_exists('Markdown') ) { # return ('Markdown', html_to_markdown($text)); # } else { $text = BigMed::Filter->filter("RichText:$text"); $text = BigMed::Filter->inline_single_graf($text); my $filter = (!$text || $text !~ /= 5.5 ); } elsif ( $agent =~ /Gecko\//i ) { $version = substr( $agent, index( $agent, 'Gecko/' ) + 6, 8 ); return ( $version >= 20030210 ); } elsif ( $agent =~ /Opera\//i ) { $version = substr( $agent, index( $agent, 'Opera/' ) + 6, 4 ); return ( $version >= 9.5 ); } elsif ( $agent =~ /AppleWebKit\/(\d+)/i ) { return ( $1 >= 522 ); } else { return 0; # 2.0 PR fix } } sub html_to_markdown { #TO DO! my $text = shift; $text; } sub sanitize_html { my($html) = @_; my $close = ' /'; #need to fix this #make all tags and attributes lowercase (for xhtml friendliness) #leaves attributes as-is $html =~ s/<\s*([^\s>]+)/<\L$1/g; #strip out line breaks; we'll work with line breaks as if they're
#(we'll turn
into line breaks later) $html =~ s/\s*\n+\s*/ /g; #transform i, b and strike --> em and strong $html =~ s/<(\/?)i>/<$1em>/g; $html =~ s/<(\/?)b>/<$1strong>/g; $html =~ s/<(\/?)strike>/<$1del>/g; $html = _scrub_bad_tags($html); $html = _degunk_word_styles($html); $html = _clean_inline_tags_across_lbs($html); #protect paragraph attributes $html =~ s{]+)>}{

}msg; #work with plain line breaks instead of
and

$html =~ s/ *]*> */\n/ig; $html =~ s/ *]*)?> */\n\n/ig; $html =~ s/ *<\/p> *//ig; # CONVERT LINE BREAKS ->
AND

# ========================================================= #add
and

formatting, adding back protected p attributes $html =~ s/]+)>\n\n/<\/p>$lb_temp

/g; $html =~ s/\n\n/<\/p>$lb_temp

/g; $html =~ s/\n/$lb_temp/g; if ($html !~ /^<($ALLOW_BLOCK)/ox) { $html = "

$html"; } $html .= "

"; # CLOSE UNCLOSED

TAGS # ========================================================= 1 while ($html =~ s/

( [^<]* # all safe content (?: (?! <\/p>) # can't match closing tag <[^<]* )* # match remaining safe stuff ) #

must be closed before it hits # one of these open or close tags <(\/?)($ALLOW_BLOCK)([^>]*)> /

$1<\/p>$lb_temp<$2$3$4>/ogx ); # ELIMINATE UNOPENED

TAGS # ========================================================= #depending on how midas stacks multiple divs, we could wind up #with extra

tags -- eliminate multiples $html =~ s/(<\/p>){2,}/<\/p>/g; #find any

that follows a block tag #without an intervening

$html =~ s/ <(\/?($ALLOW_BLOCK))( [^>]*)?> ( [^<]* # all safe content # can't match open or close p or block (?: (?!<\/?(p|$ALLOW_BLOCK)(?:[\s>])) <[^<]* )* # match remaining safe stuff ) <\/p> #dangling closed p /<$1$3>$4/ogx; # HANDLE EMPTY TAGS/ATTRIBUTES # ========================================================= #insert space into empty

tags $html =~ s/

<\/p>/

<\/p>/g; #make non-breaking space use the numeric for gentle xml behavior $html =~ s/ / /g; #eliminate empty tags except those in EMPTY_OK 1 while ($html =~ s/($lb_temp)* < (?! (?:$EMPTY_OK (?![a-zA-Z]) ) ) (\S+) [^>]* > #any tag other than empty_ok ((\s|$lb_temp)*) #empty or whitespace <\/\2>($lb_temp)* #close tag /$3/msoxg); $html =~ s/$lb_temp/\n/g; #remove trailing empty

tags $html =~ s{(\s*]*>\s*( | )*\s*

)+\z}{}ims; return $html; } # end prep_for_text routine sub _degunk_word_styles { my $html = shift; $html =~ s{\s*<\/o:p>}{}msg; $html =~ s{.+?<\/o:p>}{ }msg; $html =~ s{\s*$BAD_ATTR\s*=\s*"[^"]*"}{}msgi; $html =~ s{\s*face=[^ >]*}{}msgi; $html =~ s{class\s*=\s*"mso[^"]+"}{}msgi; $html =~ s{style\s*=\s*"([^"]*)"}{ my $v = $1; $v =~ s{\s*$BAD_STYLES\s*:[^;]+;?}{}msgi; $v ? qq{style="$v"} : q{}; }msgie; #word treats line breaks as paragraphs $html =~ s{\s*\s*( \s*)?\s*

\s*}{}msgi; #get rid of empty spans (except with intervening "loaded" spans, can #live with that) 1 while ($html =~ s/ ( [^<]* # all safe content (?: (?! <\/?span) # can't match opening or closing tag <[^<]* )* # match remaining safe stuff ) <\/span> # closing tag /$1/ogx ); return $html; } sub _scrub_bad_tags { my $html = shift; #get rid of any tags that are not loaded via the rich text editor and #so which must have been imported via other apps #replace any comment tags and contents $html =~ s///gs; #mild attempt to remove contents of bad containing tags $html =~ s/<\s*(script|style|select|head|textarea)[^>]*>.+?<\/\s*\1[^>]*>//gs; #remove all tags except those explicitly used by big medium $html =~ s/ < #start tag (?! \/? #include close tags too ( #anything other than the allowed tags $ALLOW_ALL ) (?:\s|>|\/) #followed by a space, slash or > ) [^>]*> #the bad tag to eliminate //ogx; return $html; } sub _clean_inline_tags_across_lbs { my $html = shift; #only working with inline tags that are not self-closers my $OPENTAG = "(?!$SELFCLOSER) (?:$ALLOW_INLINE)"; #move any inline closing tags that appear immediate after a
#and move them inside. $html =~ s/\s*((]*>\s*)+)((<\/($ALLOW_INLINE)>\s*)+)/$3$1/ogx; #likewise, move any open tags that appear immediate before a #
to the location immediate after (otherwise interferes #with properly generating paragraph tags) $html =~ s/\s*((<($OPENTAG(\ [^>]*)?)>\s*)+)((]*>\s*)+)/$5$1/ogx; #close and reopen any inline tags that cross a
#boundary (could eventually mean the beginning/end of #a

tag). Have to be careful here to close the #tags in the right order. #still to sort out: eliminating anchor tags across lines { no warnings; #can give noisy 'undefined value' warnings 1 while ($html =~ s/ < ($OPENTAG) (?![a-zA-z]) ([^>]*) > ( [^<]* # all safe content ($3) (?: (?! <\/\1> | # can't match closing tag <(?: br | p | $ALLOW_BLOCK ) (?![a-zA-Z]) [^>]* > ) <[^<]* )*? # match remaining safe stuff ) # must be closed before it hits # any close tags (not its own) $4 ( (<\/ (?! \1>) [^>]*> )*? ) #...which are immediately # followed by
or block $6 ((<\/? (?: br | p | $ALLOW_BLOCK) (?![a-zA-Z]) [^>]*>\s*)+) ( (< (?!\/?\1>) # and any open\/close tags not its [^>]*>)* ) # own $8 /<$1$2>$3<\/$1>$4$6$8<$1$2>/ogx); } #if any of these tags were unclosed, then there's a straggler left #at the end of the string $html =~ s/(\s* < $OPENTAG (?![a-zA-Z]) [^>]* > \s* )+\z//ox; return $html; } 1; __END__