# Copyright 2002-2008 Josh Clark and Global Moxie, LLC. This code cannot be
# redistributed without permission from globalmoxie.com.  For more
# information, consult your Big Medium license.
#
# $Id: Parse.pm 3242 2008-08-23 13:11:15Z josh $

package BigMed::App::Web::Parse;
use strict;
use utf8;
use Carp;
$Carp::Verbose = 1;
use BigMed::App;
use BigMed::Elements;
use BigMed::DiskUtil ('bm_untaint_filepath');

sub register_all {
    my $app_class = shift;
    $app_class->register_parser(
        _default => sub { _parse_raw_text(@_) },

        #BigMed::Elements types
        alignment               => \&_parse_alignment,
        body_position           => \&_parse_body_pos,
        boolean                 => \&_parse_boolean,
        document                => \&_parse_document_upload,
        id                      => \&_parse_id,
        dir_path                => \&_parse_dir_path,
        dir_url                 => \&_parse_dir_url,
        email                   => \&_parse_email,
        image_file              => \&_parse_image_file,
        key_boolean             => \&_key_boolean,
        kilobytes               => \&_parse_kilobytes,
        link_info               => \&_link_info,
        number_integer_positive => \&_parse_number_integer_positive,
        navigation_style        => \&_parse_value_list,
        raw_text                => \&_parse_raw_text,
        rich_text               => \&_parse_rich_text,
        rich_text_brief         => \&_parse_rich_text,
        rich_text_public        => \&_parse_rich_text_public,
        rich_text_inline        => \&_parse_rich_text_inline,
        password                => \&_parse_password_username,
        select_section          => \&_parse_select_section,
        simple_text             => \&_parse_simple_text,
        sort_order              => \&_parse_sort_order,
        system_time             => \&_parse_system_time,
        tags                    => \&_parse_tags,
        time_offset             => \&_parse_time_offset,
        url                     => \&_url,
        url_safe                => \&_parse_url_safe,
        username                => \&_parse_password_username,
        value_freeform          => \&_parse_value_freeform,
        value_list              => \&_parse_value_list,
        value_several           => \&_parse_value_several,
    );
}

###########################################################
# PARSE HANDLERS
###########################################################

#all parse routines receive:
#$_[0]: app object
#$_[1]: field
#$_[2]: hash ref with options

my $required_alert = 'PARSE_ERR_Please provide a value.';
my $nofield_alert  = 'PARSE_ERR_Missing field';
my $invalid_alert  = 'BM_ERR_Invalid value';
my $not_utf8       = 'PARSE_ERR_Unknown Character Encoding';

sub _parse_boolean {
    return $_[0]->utf8_param( $_[1] ) ? 1 : '';
}

sub _key_boolean {
    my %value;
    my @checked = ( $_[0]->utf8_param( $_[1] ) );
    @value{@checked} = (1) x @checked;
    \%value;
}

sub _parse_alignment {
    my ( $app, $fieldname, $opts ) = @_;
    $opts->{options} ||= ['default', 'left', 'center', 'right'];
    _parse_simple_text( $app, $fieldname, $opts );
}

sub _parse_body_pos {
    my ( $app, $fieldname, $opts ) = @_;
    defined( my $value = $app->utf8_param($fieldname) )
      or return ( undef, [$nofield_alert, $fieldname] );
    $value = _meta_no_linebreaks_or_html($value);
    return ( undef, $required_alert ) if $_[2]->{required} && !$value;
    my @opts =
      $opts->{options}
      ? @{ $opts->{options} }
      : ( 'above', 'block', 'below', 'hidden' );
    return ( undef, $invalid_alert )
      if !_matches_prefab_option( $value, \@opts );

    if ( $value eq 'block' ) {    #get paragraph number
        my $par_num = $app->utf8_param( $fieldname . '___PARNUM' );
        if (   defined( $par_num = _scrub_make_number($par_num) )
            && $par_num > 0
            && $par_num == int($par_num) )
        {
            $value = "block:$par_num";
        }
        else {
            return ( undef, 'PARSE_ERR_Must be a positive integer' );
        }
    }
    return $value;
}

sub _link_info {
    my ( $app, $fieldname, $opts ) = @_;
    defined( my $tab = $app->utf8_param($fieldname) )
      or return ( undef, [$nofield_alert, $fieldname] );
    if ( $tab eq 'internal' ) {
        my $id = $app->utf8_param( 'BM_LIB_ID_PAGE_' . $fieldname )
          or return ( undef, $required_alert );
        my $site = _current_site($app) or $app->error_stop;
        my $site_id = $site->id;
        defined( my $obj =
              BigMed::Content::Page->fetch( { site => $site_id, id => $id } )
          )
          or return undef, ['PARSE_ERR_Could not locate page', $id];
        return { url => "bm://$site_id/$id" };
    }
    else {
        my $url  = $app->utf8_param( $fieldname . '__url' );
        my $text = $app->utf8_param( $fieldname . '__text' );
        $url  = _scrub_trailing_space_linebreaks($url);
        $text = _meta_no_linebreaks_or_html($text);
        if ( $url eq '' || $url eq 'http://' || $text eq '' ) {
            return ( undef,
                'PARSE_ERR_Please provide both link text and URL' );
        }
        return ( { url => $url, text => $text } );
    }
}

sub _parse_document_upload {
    my ( $app, $fieldname, $rparam ) = @_;

    #for docs do not use utf8_param
    my $q        = $app->query;
    my $filename = $q->param($fieldname) or return [];
    my $DOCUMENT = $q->upload($fieldname) or return [];
    my $tmpfile  = $q->tmpFileName($filename) or return [];

    #check document type
    my $suffix = ( $filename =~ /[.]([a-zA-Z0-9]+)$/ ) ? lc $1 : '';
    return ( undef, 'PARSE_Unknown file type' ) if !$suffix;
    my @file_types =
      $rparam->{file_types}
      ? @{ $rparam->{file_types} }
      : @{ $app->env('FILETYPES') };
    my %check_type = map { ( $_ => 1 ) } @file_types;
    if ( !$check_type{$suffix} ) {
        return ( undef, ['PARSE_Not an approved file type', $suffix] );
    }

    #check doc length
    my $length = int( ( -s $DOCUMENT ) / 1024 ) + 1;    #kilobytes
    my $limit = $rparam->{size_limit}
      || (
          $app->can('current_site')
        ? $app->current_site->site_doclimit
        : ( $app->env('ADMIN_DOCLIMIT') || 5120 )
      );

    if ($limit) {
        if ( $length > $limit ) {
            foreach my $n ( $length, $limit ) {
                $n =
                  $n >= 1024
                  ? sprintf( "%.1f", ( $n / 1024 ) ) . ' MB'
                  : "${n} KB";
            }
            return ( undef,
                ['PARSE_File upload too large', $length, $limit] );
        }
    }
    return [_safe_filename_from_path($filename), $tmpfile, $DOCUMENT];

}    #end save_upload routine

sub _parse_email {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );
    $value = _scrub_trailing_leading_ws($value);

    if ( !$value && $_[2]->{required} ) {
        return ( undef, $required_alert );
    }
    elsif ( !$value ) {    #don't run validator if empty and not required
        return '';
    }
    require Email::Valid;
    return Email::Valid->address($value)
      || ( undef, 'PARSE_ERR_email' );
}

sub _parse_image_file {
    my ( $app, $fieldname, $rparam ) = @_;

    require BigMed::Media::Image;
    my $manual = !BigMed::Media::Image->can_thumbnail
      || !$app->utf8_param("${fieldname}__AUTO");
    my @id;
    if ($manual) {    #load all formats for a manual entry
        @id = (
            'orig',
            map { $_->[1] }
              BigMed::Media::Image->image_formats( _current_site($app) )
        );
    }
    else {
        @id = ('__MSTR');
    }

    #go through each image size; take url if exists, otherwise file.
    #if neither, ignore it (this effectively means that you cannot clear
    #a url or field, but I think that's for the best).
    my @images;
    foreach my $ext (@id) {
        my $this_field = "$fieldname$ext";
        my $dim = $ext eq '__MSTR' ? 'orig' : $ext;
        my $url;    #get the url for manual entries only
        if ($manual) {
            $url = $app->utf8_param("${this_field}__URL") || undef;
            $url &&= _scrub_url($url);
            if ($url) {    #url gets precedence, take it and move on
                push @images,
                  { url        => $url,
                    dimensions => $dim,
                    manual     => 1,
                  };
                next;
            }
        }
        my ( $rupload, $rdocerr ) =
          _parse_document_upload( $app, $this_field, $rparam );
        return ( $rupload, $rdocerr ) if !$rupload;   #parse error on document

        my ( $filename, $tmpfile, $IMAGE ) = @$rupload;
        next if !$filename;                           #nothing here

        #got a file upload, get the file info
        if ( !BigMed::Media::Image->filetype_supported($filename) ) {
            close $IMAGE if $IMAGE;  #can be treated as no file at all if .txt
            my $bad = ( $filename =~ m{.+[.]([a-zA-Z0-9]+)$} ) ? lc $1 : q{};
            return (
                undef,
                [   'IMAGE_TEXT_Unsupported file format',
                    $bad,
                    'gif, png, jpg, jpeg, jfif'
                ],
            );
        }
        push @images,
          { filename   => $filename,
            tempfile   => $tmpfile,
            filehandle => $IMAGE,
            dimensions => $dim,
            manual     => $manual,
          };
    }
    return \@images;
}

sub _parse_kilobytes {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );
    defined( my $units = $_[0]->utf8_param("$_[1]___UNITS") )
      or return ( undef, [$nofield_alert, $_[1]] );

    $value =~ s/\s+//g;    #scrub all white space
    return ( undef, $required_alert ) if $_[2]->{required} && $value eq '';
    return '' if $value eq '';

    if ( !defined( $value = _scrub_make_number($value) )
        || $value < 0 )
    {
        return ( undef, ['PARSE_ERR_Must be number zero or higher', $_[1]] );
    }
    $value *= 1024 if $units eq 'MB';
    return $value;
}

sub _parse_password_username {

    #these get parsed this way for legacy reasons; if we did it
    #differently, could run into problems with user/pass validation
    #for a few edge cases.
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );
    $value =~ s/[\cM\n\r]//g;    #no line breaks
    $value = _scrub_html_encode( _scrub_trailing_leading_ws($value) );
    return ( undef, $required_alert ) if $_[2]->{required} && $value eq '';
    $value;
}

sub _parse_raw_text {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );
    $value =
      _scrub_trailing_leading_ws( _scrub_normalize_linebreaks($value) );
    return ( undef, $required_alert ) if $_[2]->{required} && $value eq '';
    return ( undef, $invalid_alert )
      if !_matches_prefab_option( $value, $_[2]->{options} );
    $value;
}

sub _parse_rich_text {
    my ( $app, $fieldname, $opts ) = @_;
    defined( my $filtername = $app->utf8_param($fieldname) )
      or return ( undef, [$nofield_alert, $fieldname] );
    $filtername = _scrub_trailing_leading_ws($filtername);
    if ( $filtername eq '' ) {
        return $opts->{required} ? ( undef, $required_alert ) : '';
    }
    defined( my $text = $app->utf8_param( $filtername . $fieldname ) )
      or return ( undef, [$nofield_alert, $filtername . $fieldname] );
    $text = _scrub_trailing_leading_ws( _scrub_normalize_linebreaks($text) );
    return ( undef, $required_alert ) if $opts->{required} && $text eq '';
    require BigMed::Filter;
    BigMed::Filter->filter_exists($filtername)
      or return ( undef, ['PARSE_ERR_Unknown filter', $filtername] );
    return "$filtername:$text";
}

sub _parse_rich_text_inline {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );

    #same as simple_text but with italic/bold handling via markdown
    $value = _meta_no_linebreaks_or_html($value) if $value;
    return ( undef, $required_alert ) if $_[2]->{required} && $value eq q{};
    require BigMed::Filter::Markdown;

    package BigMed::Filter::Markdown;
    $value = _EscapeSpecialCharsWithinTagAttributes($value);
    $value = _EncodeBackslashEscapes($value);
    $value = _DoItalicsAndBold($value);
    $value = _UnescapeSpecialChars($value);
    $value;
}

sub _parse_rich_text_public {
    my ( $app, $fieldname, $opts ) = @_;
    defined( my $text = $app->utf8_param($fieldname) )
      or return ( undef, [$nofield_alert, $fieldname] );
    $text = _scrub_trailing_leading_ws( _scrub_normalize_linebreaks($text) );
    return ( undef, $required_alert ) if $opts->{required} && $text eq '';
    return "Markdown:$text";
}

sub _parse_select_section {
    my ( $app, $id, $roptions ) = @_;

    #collect the site or bail out
    my %options = %$roptions;
    my $site    = $options{site} || _current_site($app) || $app->error_stop;

    #gather the sections where the user has permission, if applicable;
    #if no user, give all sections
    my @sections;
    my $user = $options{user}
      || ( $app->can('current_user') ? $app->current_user : undef );
    if ($user) {
        my %allowed = $user->allowed_section_hash($site);
        @sections = grep { $allowed{$_} } ($site->homepage_id, $site->all_descendants_ids() );
    }
    else {
        @sections = ($site->all_descendants_ids(), $site->homepage_id);
    }
    $options{options} = \@sections;
    $app->parse( 'value_list', $id, \%options );
}

sub _parse_simple_text {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );
    $value = _meta_no_linebreaks_or_html($value) if $value;
    return ( undef, $required_alert ) if $_[2]->{required} && $value eq '';
    return ( undef, $invalid_alert )
      if !_matches_prefab_option( $value, $_[2]->{options} );
    $value;
}

sub _parse_sort_order {
    my @parsed = _parse_value_several(@_);
    return @parsed if !defined $parsed[0];    #error
    my @value = @{ $parsed[0] };
    my %saw_it;
    my @sort;
    my @order;
    foreach my $sort (@value) {
        next if $saw_it{$sort};
        $saw_it{$sort} = 1;
        push @sort, ( $sort eq 'chron_time' ? 'pub_time' : $sort );
        push @order,
          ( $sort eq 'title' || $sort eq 'chron_time' ) ? 'a' : 'd';
    }
    return '' if !@sort;
    return join( ':', @sort ) . '|' . join( ':', @order );
}

sub _parse_tags {
    my ( $value, $alert ) = _parse_simple_text(@_);
    return ( $value, $alert ) if !defined $value;
    $value =~ s/\s+/ /ms;
    my @values =
      grep { defined $_ && $_ ne q{} }
      split( /\s*,\s*/ms, $value );
    return \@values;
}

sub _url {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );
    $value = _scrub_url($value);
    return ( undef, $required_alert ) if $_[2]->{required} && !$value;
    $value;
}

sub _parse_value_freeform {
    my ( $app, $id, $rparam ) = @_;
    defined( my $value = $_[0]->utf8_param($id) )
      or return ( undef, [$nofield_alert, $id] );
    $value = _scrub_normalize_linebreaks($value);
    my @value;
    foreach my $line ( split( /\n/, $value ) ) {
        $line = _scrub_trailing_leading_ws($line);
        push @value, $line if $line ne '';
    }
    return ( undef, $required_alert ) if $_[2]->{required} && !@value;
    \@value;
}

sub _parse_value_list {
    my @value;
    if ( $_[2]->{multiple} ) {
        @value = $_[0]->utf8_param( $_[1] );
    }
    else {
        defined( my $value = $_[0]->utf8_param( $_[1] ) )
          or return ( undef, [$nofield_alert, $_[1]] );
        @value = ($value);
    }

    #leave out any empty values (but don't transform the values)
    my @final = grep { _scrub_trailing_space_linebreaks($_) ne '' } @value;
    return ( undef, $required_alert ) if $_[2]->{required} && !@final;
    return ( undef, $invalid_alert )
      if !_matches_prefab_option( \@final, $_[2]->{options} );
    return $_[2]->{multiple}
      ? \@final
      : ( defined $final[0] ? $final[0] : '' );
}

sub _parse_value_several {
    my ( $app, $id, $rparam ) = @_;
    return [] if $rparam->{optional_label} && !$app->utf8_param("${id}___optional");
    my $numfields = $app->utf8_param($id)
      or return ( undef, [$nofield_alert, $id] );
    my @value;
    foreach my $i ( 1 .. $numfields ) {
        defined( my $value = $app->utf8_param("${id}___$i") )
          or return ( undef, [$nofield_alert, "${id}___$i"] );

        #check if scrubbed value is empty but don't transform actual value
        next if _scrub_trailing_space_linebreaks($value) eq '';
        push @value, $value;
    }
    return ( undef, $required_alert )
      if !@value && $rparam->{required};
    return ( undef, $invalid_alert )
      if !_matches_prefab_option( \@value, $rparam->{options} );
    \@value;
}

sub _parse_dir_path {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );
    $value = _scrub_trailing_leading_ws($value) if $value;
    $value =~ s{[\cM\n\r]}{}g;         #no line breaks
    
    #UNC-format paths can start with two leading slashes
    $value =~ s{([^\\\/])([\\/]){2,}}{$1$2}g; #remove non-leading double slashes
    $value =~ s{\A([\\/]{2})[\\/]+}{$1}g; #remove more than two leading slashes

    $value =~ s{^(.+?)[\\/]+$}{$1};    #remove trailing slash
    if ( $value eq '' ) {
        return ( undef, $required_alert ) if $_[2]->{required};
        return $value;    # "" won't pass as valid file path, pass back now
    }
    $value = bm_untaint_filepath($value)
      or return ( undef, 'PARSE_ERR_dir_path contains illegal characters' );
    my $is_windows = ( index( $^O, 'MSWin' ) == 0 );

    # unc path can be used in windows, like this:
    # \\network-name\path\to\directory
    if ( $is_windows && $value !~ m{^[a-zA-Z]:[\\/]} && $value !~ m{^[\\/]{2}} ) {
        my $example = 'C:\path\to\file';
        return ( undef, ['PARSE_ERR_dir_path must be absolute', $example] );
    }
    elsif ( !$is_windows && $value !~ m{^/} ) {    #bad unix dir
        my $example = '/path/to/file';
        return ( undef, ['PARSE_ERR_dir_path must be absolute', $example] );
    }
    return $value;
}

sub _parse_dir_url {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );
    $value = _scrub_trailing_leading_ws($value) if $value;
    $value =~ s{[\cM\n\r]}{}g;                     #no line breaks
    $value =~ s{^(https?://.*?)[\\/]+$}{$1};       #remove trailing slash
    return ( undef, $required_alert ) if $_[2]->{required} && $value eq '';
    return '' if $value eq '';

    #make sure it starts with http(s) and valid host name
    my $IPv4address = '([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)';
    my $toplabel    = '([a-zA-Z][-a-zA-Z0-9]*[a-zA-Z0-9]|[a-zA-Z])';
    my $domainlabel = '(([-a-zA-Z0-9][-a-zA-Z0-9]*)?[-a-zA-Z0-9])';
    my $hostName    = "(($domainlabel\\.)*$toplabel\\.?)";
    my $host        = "($IPv4address|$hostName)";
    $value =~ m{^https?://$host}
      or return ( undef, 'PARSE_ERR_dir_url http format' );
    $value;
}

sub _parse_id {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );
    if (   defined( $value = _scrub_make_number($value) )
        && $value > 0
        && $value == int($value) )
    {
        return $value;
    }
    return $_[2]->{required} ? ( undef, $required_alert ) : '';
}

sub _parse_number_integer_positive {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );

    $value =~ s/\s+//g;    #scrub all white space
    return ( undef, $required_alert ) if $_[2]->{required} && $value eq '';
    return '' if $value eq '';
    if (   defined( $value = _scrub_make_number($value) )
        && $value > 0
        && $value == int($value) )
    {
        return $value;
    }
    else {
        return ( undef, 'PARSE_ERR_Must be a positive integer' );
    }
}

sub _parse_system_time {
    my ( $app, $id, $rparam ) = @_;
    return ''
      if $rparam->{option_label} && !$app->utf8_param("${id}___optional");
    my @err;
    my %value = ( offset => $rparam->{offset} );

    foreach my $v qw(month day year hour minute) {
        defined( $value{$v} = $app->utf8_param("${id}___$v") )
          or push @err, $app->language("PARSE_TIME_$v");
    }
    return ( undef, ['PARSE_ERR_Missing time info', join( ", ", @err )] )
      if @err;
    if ( $app->utf8_param("${id}___ampm") && $app->utf8_param("${id}___ampm") eq 'pm' ) {
        $value{hour} += 12 if $value{hour} < 12;
    }
    return BigMed->time_obj(%value)->bigmed_time;
}

sub _parse_time_offset {
    my $app = shift;
    my $id  = shift;
    my ( $sign, $hour, $minute ) = (
        $app->utf8_param( $id . '-sign' ),
        $app->utf8_param( $id . '-hour' ),
        $app->utf8_param( $id . '-minute' ),
    );
    if ( !defined $sign || !defined $hour || !defined $minute ) {
        return ( undef, [$nofield_alert, $id] );
    }
    if ( $sign ne '+' && $sign ne '-' ) {
        return ( undef, 'PARSE_ERR_Sign must be "+" or "-"' );
    }
    foreach my $value ( $hour, $minute ) {
        if (   !defined( $value = _scrub_make_number($value) )
            || $value < 0
            || $value != int($value) )
        {
            return ( undef,
                'PARSE_ERR_Hours and minutes must be integers >= 0' );
        }
        $value += 0;
    }
    if ( $hour > 23 ) {
        return ( undef, 'PARSE_ERR_Hour must be a number from 0 to 23' );
    }
    if ( $minute > 59 ) {
        return ( undef, 'PARSE_ERR_Minute must be a number from 0 to 59' );
    }
    $sign . $hour . ':' . sprintf( '%02d', $minute );
}

sub _parse_url_safe {
    defined( my $value = $_[0]->utf8_param( $_[1] ) )
      or return ( undef, [$nofield_alert, $_[1]] );
    $value = _scrub_trailing_leading_ws($value);
    if ( $value ne _scrub_url_safe($value) ) {
        return ( undef, 'PARSE_ERR_Only alphanumerics hyphens underscores' );
    }
    return ( undef, $required_alert ) if $_[2]->{required} && $value eq '';
    $value;
}

###########################################################
# META SCRUBBERS -- COMMON COMBINATIONS
###########################################################

sub _meta_no_linebreaks_or_html {

    #get rid of leading/trailing spaces and all linebreaks. all whitespace
    #(including non-breaking spaces) are turned into a single space.
    #encodes html
    _scrub_html_encode( _scrub_linebreaks_multi_ws( $_[0] ) );
}

###########################################################
# STRING-SCRUBBING ROUTINES... hygiene first!
###########################################################

sub _matches_prefab_option {
    my $value = shift;
    my $ropts = shift;
    return 1 if !$ropts || !defined $value || $value eq '';
    my @value = ref $value ? @$value : ($value);
    my @opts  = ref $ropts ? @$ropts : ($ropts);
    my %check;
    @check{@opts} = ();
    foreach my $v (@value) {
        return undef if !exists $check{$v};
    }
    return 1;
}

sub _scrub_trailing_leading_ws {

    # eliminate leading/trailing spaces
    ( my $value = $_[0] ) =~ s/\A\s+//;
    $value =~ s/\s+\z//;
    $value;
}

sub _scrub_linebreaks {
    my $value = shift;
    $value =~ s/[\cM\n\r]+/ /g;
    $value;
}

sub _scrub_url {
    my $value = shift;
    $value = _scrub_trailing_space_linebreaks($value);
    $value =~ s/ /%20/g;
    $value = '' if $value eq 'http://';
    return $value;
}

sub _scrub_trailing_space_linebreaks {
    _scrub_linebreaks( _scrub_trailing_leading_ws( $_[0] ) );
}

sub _scrub_linebreaks_multi_ws {

    # eliminate leading/trailing spaces, turn multiple white space into
    # line breaks, and eliminate line breaks
    #(including line breaks) into a single space
    my $value = ( _scrub_trailing_leading_ws( $_[0] ) );
    $value =~ s/[\cM\n\r]+/ /g;       #no line breaks
    $value =~ s/(\xc2\xa0)+/ /g;    #nonbreaking spaces
    $value =~ s/\s+/ /g;
    $value;
}

#updated my original, more naive version with this more robust version,
#including parsing of various unicode newline-ish characters:
#http://www.onlamp.com/pub/a/onlamp/2006/08/17/understanding-newlines.html
#but not for \x{2028}|\x{2029}, which clip parens under Perl 5.6.1
#and removed \x{0085}|\x{000C} too, since they croak under 5.6.0 without use utf8.
sub _scrub_normalize_linebreaks {
    my $text = shift;
    my $ALT_NL = "\n" eq "\012" ? "\015" : "\012";
    $text =~ s/\015\012|$ALT_NL/\n/go;
    return $text;
}

sub _scrub_html_encode {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/"/&quot;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/</&lt;/g;
    $value;
}

sub _scrub_make_number {    #returns undef if not a number
    if ( defined $_[0]
        && $_[0] =~ m{\A\s*(\+|\-)?\s*([\d\,]+[.]?\d*|[.]\d+)\s*\z} )
    {
        my $n = $2;
        $n =~ s/,//g;
        return $1 ? ( $1 . $n ) + 0 : $n + 0;
    }
    return undef;
}

sub _scrub_url_safe {       #for slugs
    my $value = shift;
    $value =~ s{\s+}{\-}g;
    $value =~ s{[^a-zA-Z0-9\-_]}{}g;
    $value;
}

sub _scrub_filename {       #same as above, but convert dots to dashes
    my $value = shift;
    $value =~ s{[\s.]+}{\-}g;
    $value =~ s{[^a-zA-Z0-9\-_]}{}g;
    $value;
}

sub _safe_filename_from_path {
    my $filename = lc shift;
    my $suffix = ( $filename =~ /([.][a-z0-9]+)$/ ) ? lc $1 : '';
    $filename =~ s{.*[\\/]}{}msg;    # clear directory path
    my $basename = ( $filename =~ m{(.*)$suffix$} ) ? $1 : '';
    $basename = _scrub_filename($basename);
    $basename ||= 'document';
    $filename = "$basename$suffix";
}

sub _current_site {
    my $app = shift;
    my $site;
    if ( $app->can('current_site') ) {
        $site = $app->current_site;
    }
    else {
        require BigMed::Site;
        my $site_id = $app->path_site();
        defined( $site = BigMed::Site->fetch($site_id) ) or return;
        if ( !$site ) {
            $app->set_error(
                head => 'BM_HEAD_Unknown site',
                text => 'BM_TEXT_Unknown site'
            );
            $app->error_stop;
        }
    }
    return $site;
}

1;

__END__

=head1 NAME

BigMed::App::Web::Parse - Reads and validates form submissions to the
Big Medium web application

=head1 DESCRIPTION

In BigMed::App::Web, the C<parse> method is used to load and validate
form submissions to the Big Medium web application.
BigMed::App::Web::Parse provides the routines that do this.

This documentation describes the available parse types and the options that
they accept via BigMed::App::Web's C<parse> method.

=head1 USAGE

=head2 Registering C<parse> routines

BigMed::App::Web registers the BigMed::App::Web::Parse's parse
routines via the C<register_all> method:

    BigMed::App::Web::Parse->register_all();

=head2 Accessing C<parse> routines

Access to these routines is via the BigMed::App::Web C<parse> method:

    $app->parse('field type', 'field name', \%options);

The parse method returns the validated value (depending on the field
type, this typically includes some value cleanup, like html encoding,
whitespace tidying, etc).

If the value is not valid or if no such field was included in the html
form, the parse method returns undef as the first
value and a (unescaped, unlocalized) error message in the second value.

=head2 Option parameters

The C<parse> method allows you to include a reference to an options
hash. Some field types accept additional custom options, but all parsers
accept these key/value pairs:

=over 4

=item * required => 1

A boolean value that tells the field whether the value is required. If
true and there is no value, the parse method will flag it as an invalid
value.

=back

=head1 The Parse Types

These are the available prompt types. The parse type should be included
as the first argument in a call to C<parse>.

=head2 C<alignment>

Parsed as C<simple_text>. If no options are supplied, constrains values to
this default set of values: 'default', 'left', 'center', 'right'.

=head2 C<body_position>

Constrains options to values in the options attribute (if called via
BigMed::App's parse_submission method, these default options are
supplied automatically: 'above','block','below','hidden'). If the value is
"block" then the paragraph number is also read, constrained to be a positive
integer, and appended like C<block:3>.

=head2 C<boolean>

Returns a true value if the field has a true value, or a (defined) false
value if not true.

=head2 C<dir_path>

Cleanup: Removes leading and trailing spaces, double slashes, line
breaks and any trailing slashes.

Validation: Makes sure that it's an absolute path (starts with slash or,
for Windows systems, has a format like C:\path\to\dir or
\\volumename\path\to\dir). Also makes sure
that it passes the C<bm_untaint_filepath> test in BigMed::DiskUtil, which
prevents the following characters from being included in the path:

    .. | ; < > ` * ( ) [ ] { } $ \n \r

=head2 C<dir_url>

Cleanup: Removes leading and trailing spaces, line
breaks and any trailing slashes.

Validation: Makes sure that it starts with http:// or https://
followed by a valid host name.

=head2 C<document>

Processes a file_upload field and, on success, returns an array reference
with three items: a safe filename (based on the file's original name),
the path to the temporary file, and a file handle for the temporary
file.

The returned array reference is empty if no file is uploaded.

Note that this almost certainly requires some post-parse handling
to move the file into its correct location and convert the value
into a plain old filename (see e.g. BigMed::Media::Document's
post_parse callback routine).

Also, the required aption does not work as usual for this field. If you
require a value for this field, this should be handled in a post_parse
callback.

Cleanup: The filename is changed to lower-case and all non-alphanumeric
and non-hyphen/non-underscore characters are removed, with spaces and
periods changed to hyphens.

Validation: The file suffix must be alphanumeric, and a file suffix is
required. The suffix must be an approved file type. The file
cannot exceed the size limit.

The document parser accepts one optional parameter:

=over 4

=item * size_limit => $kilobytes

If specified, determines a custom size limit for the file upload, or
the default specified by ADMIN_DOCLIMIT is used.

=back

=head2 C<email>

Cleanup: Removes leading and trailing spaces before validating. May
modify/simplify certain email addresses (e.g. rfc822 email addresses
that include comments, or a "name" <address> format).

Validation: Checks to make sure that it's a valid rfc822 e-mail address.

=head2 C<id>

Cleanup: Removes all spaces before validating.

Validation: Requires a positive integer. Exception is the empty string
which is returned as-is: ''

=head2 C<kilobytes>

Cleanup: Removes all spaces before validating.

Validation: Requires a number zero or greater. (One exception: if required
flag is not set, will return empty string if the value is empty.) Converts
value to kilobytes if "megabytes" is selected as the unit.

=head2 C<number_integer_positive>

Cleanup: It eliminates any whitespace before doing validation.

Validation: Must be a positive integer. (One exception: if required flag is not
set, will return empty string if the value is empty.)

=head2 C<password>

Cleanup: It eliminates line breaks and trailing spaces, and encodes
html (that last bit is a little weird, I know... it's a legacy thing).

Validation: None.

=head2 C<raw_text>

Removes trailing and leading spaces and normalizes line breaks.

=head2 C<rich_text>

Removes trailing and leading spaces and normalizes line breaks for the
text submitted for the selected rich text filter. Verifies that the filter
name corresponds with a filter registered with BigMed::Filter.
The final string is formatted in Big Medium's rich_text format:

    FILTER_NAME:TEXT

So, the text "Hello, World." submitted via RawHTML would be:

    RawHTML:Hello, World.

Note that the returned text is not filtered. The filter is run at build
time, not at submission.

=head2 C<rich_text_brief>

Same parsing as c<rich_text> above.

=head2 C<rich_text_public>

Same parsing as c<rich_text> above, but specific to Markdown. The final string
is formatted in Big Medium's rich_text format:

    Markdown:Hello, World.

=head2 C<simple_text>

Cleanup: Get rid of leading/trailing spaces and all linebreaks. All whitespace
(including non-breaking spaces) are turned into a single space. Encodes html.

Validation: None

=head2 C<sort_order>

Cleanup: Parses the sort_order prompt into Big Medium's internal sort_order
string format.

Validation: None.

=head2 C<time_offset>

Cleanup: Formats the time into +/-H:MM or +/-HH:MM format.

Validation: Requires that the sign be either + or -, that H be an integer
between 0 and 23 and that MM be a number between 0 and 59.

=head2 C<url_safe>

Cleanup: Removes leading/trailing white space before validating.

Validation: Does not allow any characters other than alphanumerics, hyphens,
or underscores.

=head2 C<username>

Cleanup: It eliminates line breaks and trailing spaces, and encodes html.

Validation: None.

=head2 C<value_freeform>

Parses a text list into an array, splitting on line breaks.

Cleanup: It eliminates leading/trailing white space from each item and throws
out any empty lines.

Validation: None.

=head1 SEE ALSO

BigMed::App::Web

=head1 AUTHOR & COPYRIGHTS

This module and all Big Medium modules are copyright Josh Clark
and Global Moxie. All rights reserved.

Use of this module and the Big Medium content
management system are governed by Global Moxie's software licenses
and may not be used outside of the terms and conditions outlined
there.

For more information, visit the Global Moxie website at
L<http://globalmoxie.com/>.

Big Medium and Global Moxie are service marks of Global Moxie
and Josh Clark. All rights reserved.

=cut