# Copyright 2002-2008 Josh Clark and Global Moxie, LLC. This code cannot be
# redistributed without permission from globalmoxie.com.  For more
# information, consult your Big Medium license.
#
# $Id: Sitemap.pm 3233 2008-08-21 12:47:26Z josh $

package BigMed::Format::Sitemap;
use strict;
use warnings;
use utf8;
use Carp;
use base qw(BigMed::Format);
use BigMed::DiskUtil qw(bm_file_path bm_untaint_filepath bm_delete_file);

# File-level settings.
# $MAP is this format's class name; it is the invocant for every
# registration call below and for escape_xml().
# $LIMIT_FIRST_SITEMAP is the item limit of the 'content' widget, and
# $LIMIT_OVERFLOW_SITEMAPS is the per-page limit of its overflow pages.
# $INDEX_ORPHAN is an age in seconds (one day): build_sitemap_index deletes
# sitemap files whose modification time is older than this.
my $MAP = 'BigMed::Format::Sitemap';
my $LIMIT_FIRST_SITEMAP     = 100;       #for performance using defer_overflow
my $LIMIT_OVERFLOW_SITEMAPS = 10_000;    #sitemaps protocol allows at most 50,000 urls per file
my $INDEX_ORPHAN = 60 * 60 * 24;         #age threshold for orphaned sitemap files

###########################################################
# TEMPLATE REGISTRATION
###########################################################

# Register the 'Sitemap' format with an 'xml' suffix. can_build() is
# supplied as the is_active callback.
$MAP->register_format(
    'Sitemap',
    suffix         => 'xml',
    is_active => \&can_build
);

# is_active callback for the Sitemap format.
# Takes the build context and returns 1 when the context is at the 'top'
# level and the site's page (html) url begins with its homepage url;
# returns 0 in every other case.
sub can_build {
    my ($context) = @_;

    if ( $context->level eq 'top' ) {

        #only worth building when the page directory url sits inside
        #the homepage directory url
        my $site         = $context->site;
        my $homepage_url = $site->homepage_url;
        my $page_url     = $site->html_url;
        return 1 if index( $page_url, $homepage_url ) == 0;
    }

    return 0;
}

#Two templates are registered. 'sitemap' is assigned to the top level.
#'sitemap_index' is not assigned to any level, so it only runs via its
#level_extras callback (build_sitemap_index) after the sitemap template
#and its overflows are built. This allows it to create a complete list of
#the sitemap files with accurate dates and with orphans removed.

$MAP->register_template(
    {   name        => 'sitemap',
        description => 'SITEMAP_TMPL_DESC_sitemap',
        level       => 'top',
        filename    => 'bm~sitemap',
    },
    {   name         => 'sitemap_index',
        description  => 'SITEMAP_TMPL_DESC_sitemap_index',
        level_extras => \&build_sitemap_index,
        filename     => 'bm~sitemap_index',
    },
);

###########################################################
# JUST ONE WIDGET: CONTENT (AND ITS OVERFLOW)
###########################################################

# Collector group 'sitemap_items': sorted on mod_time, descending. Its
# overflow configuration reuses the 'sitemap' template and the 'bm~sitemap'
# filename, collects with collect_item, assembles with assemble_overflow,
# and takes $LIMIT_OVERFLOW_SITEMAPS as its page_limit_num.
$MAP->add_collector_group(
    name     => 'sitemap_items',
    sort     => ['mod_time'],
    order    => ['descend'],
    overflow => {
        filename       => 'bm~sitemap',
        template       => 'sitemap',
        collector      => \&collect_item,
        assembler      => \&assemble_overflow,
        page_limit_num => $LIMIT_OVERFLOW_SITEMAPS,
    }
);
# The 'content' widget collects for 'sitemap_items' using the same
# collector as the overflow, and is limited to $LIMIT_FIRST_SITEMAP items.
$MAP->add_widget(
    name         => 'content',
    collects_for => 'sitemap_items',
    collector    => \&collect_item,
    assembler    => \&assemble_item,
    priority     => 100,                    #do first, before sitemap
    limit        => $LIMIT_FIRST_SITEMAP,
);

# Collector for the 'content' widget and for the 'sitemap_items' overflow.
#
# Arguments:
#   $self    - the collecting object; must provide add_to_collection()
#   $context - the build context (site, section, relation cache, stash)
#   $obj     - the content object being considered for the sitemap
#
# Returns 0 when the object is skipped: it carries the 'hideall' or
# 'html_znosearch' flag, it has no active page url, or its url does not
# begin with the site's homepage url. Otherwise returns whatever
# add_to_collection() returns for the { loc, lastmod, priority } entry.
#
# Side effect: the 'MAP_section_dates' stash on the context is updated with
# the newest lastmod value seen so far for each of the object's sections
# and for each of those sections' parents.
sub collect_item {
    my ( $self, $context, $obj ) = @_;
    my %flag = $obj->flags;
    return 0 if $flag{'hideall'} || $flag{'html_znosearch'};

    my $site = $context->site;
    my $url  = $obj->active_page_url(
        $site,
        {   section => $context->section,
            rcache  => $context->relation_cache,
            rkids   => $context->active_descendants,
        }
      )
      or return 0;

    my $is_section = $obj->subtype eq 'section';

    #active_page_url returns the actual page itself, if active,
    #even for aliased section pages. For an aliased section the alias
    #is the url that belongs in the sitemap.
    $url = $context->section->alias
      if ( $is_section && $context->section->alias );

    #url has to begin with homepage directory, where the sitemaps are stored;
    #required by the sitemaps protocol:
    #http://www.sitemaps.org/protocol.php#location
    return 0 if !$url || index( $url, $site->homepage_url ) != 0;
    $url = $MAP->escape_xml($url);

    #w3c date format
    my $date = w3c_date( $obj->mod_time );

    #sections should reflect the mod date of most recent child page so
    #that links are included; dates share one format, so string
    #comparison orders them chronologically
    my $rsections = $context->stash('MAP_section_dates') || {};
    foreach my $secid ( $obj->sections ) {
        my $sec = $site->section_obj_by_id($secid) or next;
        foreach my $p ( $sec->parents, $secid ) {
            $rsections->{$p} = $date
              if !$rsections->{$p} || $date gt $rsections->{$p};
        }
    }
    $context->set_stash( 'MAP_section_dates', $rsections );

    my $priority;
    if ($is_section) {
        $priority = 1;
        my $sid = ( $obj->sections )[0];

        #the stash has no entry when the section lookup above failed, so
        #only compare when a date was actually recorded for this section
        my $section_date = $sid ? $rsections->{$sid} : undef;
        $date = $section_date
          if defined $section_date && $section_date gt $date;
    }
    else {
        $priority =
          defined $obj->priority
          ? ( $obj->priority / 1000 )
          : 0.5;
    }

    return $self->add_to_collection(
        {   loc      => $url,
            lastmod  => $date,
            priority => $priority,
        }
    );
}

# Assembler for the 'content' widget: hands the widget's collection to the
# wi_content.tmpl template as 'items' and returns the resulting markup.
sub assemble_item {
    my ( $widget, $context ) = @_;
    my @tmpl_args = ( items => $widget->collection );
    return $context->build_markup( 'wi_content.tmpl', @tmpl_args );
}

# Assembler for sitemap overflow pages: renders the supplied item list
# through wi_content.tmpl. Arguments after the item list (page number and
# total page count) are accepted but not used.
sub assemble_overflow {
    my ( $widget, $context, $ritems ) = @_;
    my @tmpl_args = ( items => $ritems );
    return $context->build_markup( 'wi_content.tmpl', @tmpl_args );
}

###########################################################
# BUILD META FILE OF SITEMAP INDICES
###########################################################

# level_extras callback for the 'sitemap_index' template.
#
# Scans the site's homepage directory for the main sitemap file and its
# overflow files, deletes any whose modification time is more than
# $INDEX_ORPHAN seconds old, and renders the remaining files through
# wi_sitemaps.tmpl.
#
# Argument: the build context.
# Returns nothing when the context is not at the 'top' level, the result
# of BigMed->set_io_error() when the directory cannot be opened, and
# otherwise a hash reference { content => <index markup> }.
sub build_sitemap_index {
    my ($context) = @_;
    return if $context->level ne 'top';

    my $site     = $context->site;
    my $smap_dir = $site->homepage_dir;

    my $DIR;
    opendir( $DIR, $smap_dir )
      or return BigMed->set_io_error( $DIR, 'opendir', $smap_dir, $! );
    my $dot = BigMed->bigmed->env('DOT');

    #compiled on every call so the pattern always reflects the current
    #separator (the old /o modifier froze the first value seen).
    #NOTE(review): the pattern is unanchored, so any directory entry
    #containing this sequence is treated as an overflow sitemap and is
    #subject to the orphan deletion below -- confirm that is intended.
    my $overflow_re = qr/bm\Q$dot\Esitemap\Q$dot\Ep/ms;
    my @paths =
      map { bm_file_path( $smap_dir, $_ ) }
      grep { $_ =~ $overflow_re } readdir($DIR);
    closedir($DIR);

    #the main sitemap always leads the list
    unshift @paths, bm_file_path( $smap_dir, "bm${dot}sitemap.xml" );

    my $now      = time;
    my $base_url = $site->homepage_url;
    my @sitemaps;
    foreach my $smap (@paths) {
        my $path = bm_untaint_filepath($smap) or next;

        #one stat answers both "does it exist" and "when was it
        #modified"; a missing or unreadable file is skipped
        my $mod_epoch = ( stat $path )[9];
        next if !defined $mod_epoch;

        #files not rewritten within the orphan window are leftovers
        if ( $now - $mod_epoch > $INDEX_ORPHAN ) {
            bm_delete_file($path);
            next;
        }

        #gmtime gives a zero-based month and a year offset from 1900
        my %time;
        @time{qw(second minute hour day month year)} =
          ( gmtime($mod_epoch) )[ 0 .. 5 ];
        $time{year}  += 1900;
        $time{month} += 1;
        my $date = w3c_date( BigMed->bigmed_time(%time) );

        #strip the directory portion, leaving the bare filename
        ( my $file = $smap ) =~ s{\A.*[/\\]}{}ms;
        my $url = $MAP->escape_xml("$base_url/$file");

        push @sitemaps, { loc => $url, lastmod => $date };
    }

    my $index =
      $context->build_markup( 'wi_sitemaps.tmpl', sitemaps => \@sitemaps );
    return { content => $index };
}

###########################################################
# MISC HELPERS
###########################################################

# Converts a timestamp string to the W3C datetime form used in sitemaps:
# the first space in the value becomes 'T' and '+00:00' is appended.
# When called without a value (or with a false one), the result of
# BigMed->bigmed_time() is used instead.
sub w3c_date {
    my ($stamp) = @_;
    $stamp ||= BigMed->bigmed_time();

    my $gap = index $stamp, q{ };
    substr( $stamp, $gap, 1, 'T' ) if $gap >= 0;

    return "${stamp}+00:00";
}

1;

__END__

=head1 BigMed::Format::Sitemap

=head1 Synopsis

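=head1 Description

Big Medium output format that builds XML sitemap files. It registers the
C<Sitemap> format (suffix C<xml>), a top-level C<sitemap> template whose
C<content> widget lists page urls with their modification dates and
priorities, overflow sitemap pages for items beyond the first page, and a
C<sitemap_index> template that lists the sitemap files found in the
homepage directory. Sitemap files in that directory that have not been
modified for more than a day are deleted when the index is built.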

=head1 Author & Copyrights

This module and all Big Medium modules are copyright Josh Clark
and Global Moxie. All rights reserved.

Use of this module and the Big Medium content
management system are governed by Global Moxie's software licenses
and may not be used outside of the terms and conditions outlined
there.

For more information, visit the Global Moxie website at
L<http://globalmoxie.com/>.

Big Medium and Global Moxie are service marks of Global Moxie
and Josh Clark. All rights reserved.

=cut

