# Copyright 2002-2008 Josh Clark and Global Moxie, LLC. This code cannot be
# redistributed without permission from globalmoxie.com.  For more
# information, consult your Big Medium license.
#
# $Id: File.pm 3323 2008-09-09 15:36:56Z josh $

package BigMed::Driver::File;
use strict;
use utf8;
use Carp;
use base qw(BigMed::Driver);
use BigMed::DiskUtil
  qw(bm_file_path bm_load_file bm_write_file bm_confirm_dir bm_datafile_chmod bm_check_space bm_delete_file bm_untaint_filepath bm_delete_dir bm_copy_file bm_move_file);
use Fcntl qw(:DEFAULT :flock);

my $PERL_5_8 = ( $] >= 5.008 );    #determines utf-8 handling

###########################################################
# PUBLIC METHODS
###########################################################

my $CLASS = __PACKAGE__;
( my $SUBCLASS = $CLASS ) =~ s/^.*:://;

sub new {
    return ( bless {}, shift )->initialize(@_);
}

sub select {
    my $driver = shift;
    my $rindex = $driver->collect_matches(@_) or return undef;
    return $CLASS->new( $driver->bigmed )->set_selection($rindex);
}

sub tune {
    my ( $driver, $key ) = @_;
    my $first  = $driver->selection->[0] or return {};
    my $source = $first->{data_source};
    my $class  = BigMed::Data->source_class($source)
      or croak "Unknown source '$source'";
    my %properties = $class->properties;

    croak 'no key parameter in tune request' if !defined $key;
    croak qq{"$key" is an invalid search column for $source}
      if !defined $properties{$key} || !$properties{$key}->{index};

    my %index;
    my $count = -1;
    foreach my $item ( @{ $driver->selection } ) {
        $count++;
        my $v = $item->{$key};
        if ( !ref $v && defined $v ) {
            push @{ $index{$v} }, $count;
        }
        else {
            foreach ( @{$v} ) {
                push @{ $index{$_} }, $count;
            }
        }
    }
    $driver->{_TUNED_INDEX} = \%index;
    $driver->{_TUNED}       = $key;
    return 1;
}

sub join_points_to {
    my $driver = shift;
    $driver->_join_points( 'points_to', @_ );
}

sub join_pointed_from {
    my $driver = shift;
    $driver->_join_points( 'pointed_from', @_ );
}

sub join_has {
    my ( $driver, $rjoin, $rterms, $rparam, $source ) = @_;
    ref $rjoin eq 'HASH' or croak 'usage: join(\%join, \%terms, \%params);';
    my $key = $rjoin->{key} or croak 'No foreign key column specifed';
    my $roriginals;
    if ( !$source ) {    #requesting via existing driver obj
        $source = $driver->_source_from_selection
          or return $driver->_empty_driver;

        #the key column in the joined objects should match the IDs
        #from the current batch
        $roriginals = $driver->selection;    #save to populate index later
        my @old_id = map { $_->{id} } @$roriginals;
        my %terms;
        if ( ref $rterms eq "HASH" ) {
            %terms = %{$rterms};
        }
        elsif ( !ref $rterms && $rterms ) {
            $terms{id} = $rterms;
        }
        $terms{$key} = \@old_id;
        $rterms = \%terms;
    }
    my $site;
    $site = $rjoin->{site} if $rjoin->{site};
    $site ||= $rterms->{site};
    $rterms->{site} = $site;

    #get the matching index items for the joined class
    my $jsource = $rjoin->{'join'}
      or croak 'join source missing from first param hash';
    $jsource = $jsource->data_source if !ref $jsource;
    my $odriver = $driver->_empty_driver;
    my $rindex = $odriver->collect_matches( $rterms, $rparam, $jsource )
      or return undef;

    #if we have a full index of original selections, load matching
    #selection hashes from the original selection into a source index:
    #a hash indexed by ID of the desired objects -- we do this because some
    #items may appear more than once so we'll need to get at them multiple
    #times.
    my %s_index;
    if ($roriginals) {
        my %meta_by_id = map { $_->{$key} => 1 } @$rindex;
        foreach my $item (@$roriginals) {
            last if !keys %meta_by_id;             #found all, get out quick
            next if !$meta_by_id{ $item->{id} };
            $s_index{ $item->{id} } = $item;
            delete $meta_by_id{ $item->{id} };
        }
    }

    #shmoosh the ids from the keys into a new driver object;
    #optimize the shmooshing method depending on unique/limit settings
    my ( $u, $lim ) = ( $rjoin->{unique}, $rjoin->{limit} );
    my $rselect;
    if ( $u && $lim ) {                            #unique and limited
        $rselect =
          _map_has_u_limit( $rindex, \%s_index, $source, $key, $site, $lim );
    }
    elsif ($u) {                                   #unique, unlimited
        $rselect = _map_has_u( $rindex, \%s_index, $source, $key, $site );
    }
    elsif ($lim) {                                 #limited, not unique
        $rselect =
          _map_has_limit( $rindex, \%s_index, $source, $key, $site, $lim );
    }
    else {                                         #full results
        $rselect = _map_has( $rindex, \%s_index, $source, $key, $site );
    }
    my $select = $driver->_empty_driver;
    $select->set_selection($rselect);
    $select;
}

sub _map_has_u_limit {
    my ( $rindex, $rs_index, $source, $key, $site, $lim ) = @_;
    my %got_it;
    my @selection;
    foreach my $rec (@$rindex) {
        $rec->{$key} ||= '';
        next if !$rec->{$key} || $got_it{ $rec->{$key} };
        push @selection, $rs_index->{ $rec->{$key} }
          || { id => $rec->{$key}, data_source => $source, site => $site };
        $got_it{ $rec->{$key} } = 1;
        last if @selection >= $lim;
    }
    \@selection;
}

sub _map_has_u {
    my ( $rindex, $rs_index, $source, $key, $site ) = @_;
    my %got;
    [   map {
            $rs_index->{ $_->{$key} }
              || { id => $_->{$key}, data_source => $source, site => $site }
          }
          grep {
                $_->{$key} && !$got{ $_->{$key} }
              ? $got{ $_->{$key} } = 1
              : 0
          } @$rindex
    ];

}

sub _map_has_limit {
    my ( $rindex, $rs_index, $source, $key, $site, $lim ) = @_;
    my @selection;
    foreach my $rec (@$rindex) {
        next if !$rec->{$key};
        push @selection, $rs_index->{ $rec->{$key} }
          || { id => $rec->{$key}, data_source => $source, site => $site };
        last if @selection >= $lim;
    }
    \@selection;
}

sub _map_has {
    my ( $rindex, $rs_index, $source, $key, $site ) = @_;
    [   map {
            $_->{$key}
              ? (
                $rs_index->{ $_->{$key} }
                  || {
                    id          => $_->{$key},
                    site        => $site,
                    data_source => $source
                  }
              )
              : ()
          } @$rindex
    ];
}

sub fetch {
    my ( $driver, $rterms, $rargs, $data_source ) = @_;
    my %terms = ref $rterms eq 'HASH' ? %$rterms : ( id => $rterms );
    $rargs ||= {};
    $rargs->{limit} = 1 if !wantarray;

    if ( !$data_source && !@{ $driver->selection } ) {
        return wantarray ? () : "";
    }
    $data_source ||= $driver;

    #an id-only search means that we can go straight to the data
    #record files without searching through the index first. This
    #is only true, though, under these conditions:
    #1) id is the only search term, and it's not a from/to range search
    #   ... or ...
    #   id and site are the only search terms and it's only for one site
    #2) no sorting
    #3) only one data source
    #4) not pulling from existing selection (need to ensure only getting
    #   objects in the selection itself; so we can't do the shortcut here).
    my $id_search =
         $terms{id}
      && ref $terms{id} ne 'HASH'
      && (
        keys %terms == 1
        || (   keys %terms == 2
            && $terms{site}
            && !ref $terms{site} )
      )
      && !$rargs->{sort}
      && !ref $data_source;
    my @objects;
    if ($id_search) {    #grab objects directly

        #handle offset and limit; would be dumb to request an offset
        #in this situation, but hey, that's the way the world works
        my @id = ref $terms{id} eq 'ARRAY' ? @{ $terms{id} } : ( $terms{id} );
        my $offset = $rargs->{offset} || 0;
        while ( $offset > 0 ) {
            shift @id;
            $offset--;
        }
        @id = @id[0 .. $rargs->{limit} - 1]
          if $rargs->{limit} && @id > $rargs->{limit};

        my $class = BigMed::Data->source_class($data_source);
        $terms{site} || $class->systemwide
          or croak
          "Must supply site id to fetch site-specific $class objects";
        my $dir = $driver->data_directory_path(
            { source => $data_source, site => $terms{site} } );
        foreach (@id) {
            defined( my $obj = $driver->load_object( $class, $_, $dir ) )
              or return;
            push( @objects, $obj ) if $obj;    #empty string: doesn't exist
        }
    }
    else {

        #more sophisticated search requested
        my $rindex = $driver->collect_matches( \%terms, $rargs, $data_source )
          or return;
        my ( $last_source, $last_site, $dir, $class ) = ( "", "", "", "" );
        foreach my $item (@$rindex) {
            if ( $last_source ne $item->{data_source}
                || ( $item->{site} && $last_site ne $item->{site} ) )
            {
                $last_source = $item->{data_source};
                $last_site   = $item->{site};
                $class       = BigMed::Data->source_class($last_source);
                $dir         = $driver->data_directory_path(
                    { source => $last_source, site => $last_site } );
            }
            defined( my $obj =
                  $driver->load_object( $class, $item->{id}, $dir ) )
              or return;
            push( @objects, $obj ) if $obj;    # '' means doesn't exist
        }
    }
    return @objects if wantarray;
    return @objects > 0 ? $objects[0] : "";
}

sub next_index {
    my $driver = shift;
    my $index  = $driver->index;
    $driver->set_index( $index + 1 ) if $index < $driver->count;
    return $driver->selection->[$index];
}

sub next {
    my $driver = shift;
    my $index  = $driver->index;
    defined( my $data = $driver->load_indexed_object($index) ) or return;
    $driver->set_index( $index + 1 ) if $index < $driver->count;
    $data = $driver->next if !$data && $index < $driver->count;    #not found
    return $data;
}

sub previous {
    my $driver = shift;
    my $index  = $driver->index - 1;
    defined( my $data = $driver->load_indexed_object($index) )
      or return undef;
    $driver->set_index($index) if $index >= 0;
    if ( !$data && $index >= 0 ) { $data = $driver->previous }     #not found
    $data;
}

sub index {

    #returns the current index value, which is used for next/previous
    #requests against the last selection request.

    my $driver = shift;
    $driver->set_index(0) unless $driver->{_index};
    $driver->{_index};
}

sub set_index {
    my ( $driver, $index ) = @_;
    croak "Index must be positive integer in set_index method"
      if $index =~ /[^0-9]/;
    $driver->{_index} = $index + 0;
    $driver;
}

sub count {
    scalar @{ $_[0]->selection };
}

sub save {
    my $driver   = shift;
    my $data_obj = shift;

    my $bigmed   = $driver->bigmed;
    my $time     = $bigmed->bigmed_time();
    my $data_dir = $driver->data_directory_path($data_obj) or return;

    #populate auto-generated fields if applicable
    $driver->update_id($data_obj) or return undef;
    $data_obj->set_create_time($time) unless $data_obj->create_time;
    return 1 if !$data_obj->is_modified;    #nothing new to save;
    $data_obj->set_mod_time($time);

    #prepare data
    my ( $data_record, $index_line ) = $driver->prep_data_for_save($data_obj);

    #do index first to ensure unique values are in fact unique
    #(no need to check disk space first; add_to_index backs up data index
    #and that will provide any error message we need).
    $driver->add_to_index( $data_dir, $index_line, $data_obj )
      or return;
    bm_write_file( bm_file_path( "$data_dir", $data_obj->id . ".cgi" ),
        $data_record, { data => 1 } )
      or return;
    $data_obj->mark_unmodified;
    return 1;
}

sub reindex {
    my ( $driver, $source, $rcols ) = @_;
    my $class = BigMed::Data->source_class($source);
    if ( !$class->systemwide ) {    #reindex for all sites
        my $sites =
          BigMed::Site->select( undef, { sort => 'name', order => 'ascend' } )
          or return;
        my $s;
        while ( $s = $sites->next ) {
            $class->call_trigger( 'before_reindex', $s, $rcols );
            $driver->_do_reindex( $class, $s->id, $rcols ) or return;
        }
        return if !defined $s;
    }
    else {
        $class->call_trigger( 'before_reindex', undef, $rcols );
        $driver->_do_reindex( $class, undef, $rcols ) or return;
    }

    return 1;
}

sub _do_reindex {
    my ( $driver, $class, $site_id, $rcols ) = @_;
    my $sample = $class->new();
    $sample->set_site($site_id) if $site_id;
    my $data_dir = $driver->data_directory_path($sample) or return;

    my %col = ref $rcols eq 'HASH' ? %{$rcols} : ();
    my %set;
    foreach my $k ( keys %col ) {
        $set{"set_$k"} = $col{$k};
    }

    #get all files and plug into index
    my $count = 0;
    my $DIR;
    opendir( $DIR, $data_dir )
      or return $driver->set_io_error( $DIR, 'opendir', $data_dir, $! );
    while ( defined( my $file = readdir($DIR) ) ) {
        next if $file !~ /\A(\d+)[.]cgi\z/ms;
        my $obj =
            $site_id
          ? $class->fetch( { id => $1, site => $site_id } )
          : $class->fetch( { id => $1 } );
        return if !defined $obj;
        next if !$obj || !$obj->id;    #trouble loading file, or file is empty
        if ( $count % 25 == 0 ) {
            $class->call_trigger( 'mid_reindex', $site_id, $count );
        }
        $count++;
        if (%set) {
            while ( my ( $set, $val ) = each %set ) {
                $obj->$set($val);
            }
            $obj->save or return;
        }
        else {
            bm_check_space( $data_dir, 100 ) or return;
            my ( $data_record, $index_line ) =
              $driver->prep_data_for_save($obj);
            $driver->add_to_index( $data_dir, $index_line, $obj, 'reindex' )
              or return;
        }
    }
    closedir($DIR);

    return 1;
}

sub trash {

    #removes the object in $_[1] from the data store and clears
    #the values in the data object.
    #Does not handle any of the updates to public pages or files;
    #just manages the removal from the data store and index

    my $driver   = shift;
    my $data_obj = shift;
    ( ref($data_obj) && $data_obj->isa('BigMed::Data') )
      or croak "trash method requires a data object";

    #sites get their entire data directory deleted
    if ( $data_obj->isa('BigMed::Site') && $data_obj->id ) {
        my $sitedir = $driver->site_directory_path($data_obj);
        bm_delete_dir($sitedir) or return;
    }

    #put the essential object data into an array reference
    #and send it to the delete_objects utility routine
    my $id     = $data_obj->id;
    my $site   = $data_obj->systemwide ? undef : $data_obj->site;
    my $source = $data_obj->data_source;
    $driver->delete_objects( [$source, $id, $site] ) or return;

    return 1;
}

sub trash_all {

    #deletes all of the objects in the driver's selection
    #array. As with trash, does not handle any of the updates
    #to public pages or files; just manages the removal
    #from the data store and index.
    #the selection of the driver is zeroed out.
    #
    #site objects get special handling; their entire data directory is zapped

    my $driver = shift;

    defined( $driver->call_trigger('before_trash_all') ) or return undef;

    my @delete;
    foreach my $item ( @{ $driver->selection } ) {
        my $class = BigMed::Data->source_class( $item->{data_source} );
        my $site = $class->systemwide ? undef : $item->{site};
        push @delete, [$item->{data_source}, $item->{id}, $site];

        if ( $class eq 'BigMed::Site' && $item->{id} ) {
            my $sitedir = $driver->site_directory_path( $item->{id} );
            bm_delete_dir($sitedir) or return;
        }

    }

    $driver->delete_objects(@delete) or return undef;
    defined( $driver->call_trigger('after_trash_all') ) or return undef;

    #clear the selection
    $driver->set_selection( [] );
    return 1;
}

sub update_id {
    my ( $driver, $data_obj ) = @_;
    my $floor    = $data_obj->id;
    my $data_dir = $driver->counter_directory_path or return undef;
    my $counter  = $driver->_counter_file_path( $data_dir, $data_obj );

    my $COUNTER;
    sysopen( $COUNTER, $counter, O_RDWR | O_CREAT, bm_datafile_chmod() )
      or return $driver->set_io_error( $COUNTER, "open", $counter, $! );
    flock( $COUNTER, LOCK_EX )
      or return $driver->set_io_error( $COUNTER, "lock_ex", $counter, $! );

    my $num = <$COUNTER> || 0;
    if ( $floor && $floor <= $num ) { close($COUNTER); return $floor; }
    elsif ($floor) { $num = $floor }
    else           { $num++; }

    unless ( bm_check_space( $data_dir, 5 ) ) {
        close($COUNTER);
        return undef;
    }
    seek( $COUNTER, 0, 0 )
      or return $driver->set_io_error( $COUNTER, "rewind", $counter, $! );
    truncate( $COUNTER, 0 )
      or return $driver->set_io_error( $COUNTER, "truncate", $counter, $! );
    print $COUNTER $num, "\n"
      or return $driver->set_io_error( $COUNTER, "write", $counter, $! );
    close($COUNTER)
      or return $driver->set_io_error( $COUNTER, "close", $counter, $! );

    $data_obj->set_id($num);
    return $num;
}

sub has_valid_id {
    my ( $driver, $data_obj ) = @_;
    my $id = $data_obj->id or return undef;
    my $data_dir = $driver->counter_directory_path or return undef;
    my $counter = $driver->_counter_file_path( $data_dir, $data_obj );
    my $last = ( bm_load_file($counter) )[0];
    if ( $id > $last ) {
        $data_obj->set_id(undef);
        return undef;
    }
    return 1;
}

sub selection_class {
    my $driver = shift;
    my $source = $driver->_source_from_selection
      or return '';    #no items in selection
    return BigMed::Data->source_class($source);
}

sub _counter_file_path {
    my ( $driver, $data_dir, $data_obj ) = @_;
    my $counter = "last_" . $data_obj->data_source . ".cgi";
    bm_file_path( $data_dir, $counter );
}

###########################################################
# PRIVATE :: DATA PREPARATION
# Supports save method
###########################################################

sub prep_data_for_save {

    #private, should only be called by this driver subclass

    my ( $driver, $data_obj ) = @_;

    my %properties = $data_obj->properties;
    my @data_file;
    my @index_line;
    foreach my $element ( $data_obj->data_columns ) {
        my $ref_method = "_ref_$element";    #to get a reference value
        if ( !defined $data_obj->$ref_method() ) {
            push( @index_line, "" )
              if $properties{$element}->{index};
            next;
        }
        my $packed = $driver->pack(
            $properties{$element}->{type},
            $data_obj->$ref_method(),
            $properties{$element}
        );
        push( @data_file,  $properties{$element}->{name} . "::" . $packed );
        push( @index_line, $packed )
          if $properties{$element}->{index};
    }
    return ( join( "\n", @data_file ), join( "_!!!_", @index_line ) );
}

###########################################################
# PRIVATE :: SELECTION ACCESSORS
###########################################################

sub selection {
    my $driver = shift;
    $driver->set_selection( [] ) if !$driver->{_selection};
    $driver->{_selection};
}

sub set_selection {
    my ( $driver, $selection ) = @_;
    croak 'Selection must be an array reference in set_selection method'
      if ref $selection ne 'ARRAY';
    $driver->{_selection} = $selection;
    $driver->set_index(0);
    $driver;
}

###########################################################
# PRIVATE :: JOIN_POINTS SUPPORT ROUTINES
###########################################################

sub _join_points {    #INTERNAL METHOD
    my ( $driver, $from_or_to, $rjoin, $rterms, $rparam, $source ) = @_;
    ref $rjoin eq 'HASH' or croak 'usage: join(\%join, \%terms, \%params);';
    $rjoin->{join} or croak 'join_points operation requires a join param';

    my $site = $rjoin->{site};
    $site ||= $rterms->{site} if ref $rterms eq 'HASH';
    $site or croak 'join_points operations require a site id';

    my @old_id;
    my $orig_selection;
    if ( !$source ) {    #requesting via existing driver obj
        $orig_selection = $driver->selection;
        $source         = $driver->_source_from_selection
          or return $driver->_empty_driver;
        @old_id = map { $_->{id} } @{ $driver->selection };
    }

    #gather a selection of pointer links between the two object types
    my ( $jsource, $pselection ) =
      $driver->_pointer_source_and_selection( $from_or_to, $source, $rjoin,
        $site, \@old_id );
    $jsource or return $driver->_empty_driver;
    return $driver->_empty_driver if !@$pselection;

    #build a map of related objects to the wanted objects; can be multiple
    #wanted objects for each related object, so every object has an array of
    #IDs of wanted objects.
    my %lookup;
    my ( $want_id, $other_id ) =
      $from_or_to eq 'points_to'
      ? ( 'source_id', 'target_id' )
      : ( 'target_id', 'source_id' );
    foreach my $obj (@$pselection) {
        push @{ $lookup{ $obj->{$other_id} } }, $obj->{$want_id};
    }

    #find and sort the related objects
    my %terms = ref $rterms eq 'HASH' ? %$rterms : ();
    $terms{site} = $site;
    my @limit_id =
        ref $terms{id} eq 'ARRAY' ? @{ $terms{id} }
      : $terms{id} ? ( $terms{id} )
      :              ();
    if (@limit_id) {    #specifying specific objects, filter found objects
        my %wanted = map { $_ => 1 } @limit_id;
        my @winnowed = ( grep { $wanted{$_} } keys %lookup )
          or return $driver->_empty_driver;    #none left
        $terms{id} = \@winnowed;
    }
    else {
        $terms{id} = [keys %lookup];           #IDs of all found objects
    }
    my $o_driver = $driver->_empty_driver;
    my $obj = $o_driver->collect_matches( \%terms, $rparam, $jsource );

    #if we have a full index of original selections, load matching
    #selection hashes from the original selection into a source index:
    #a hash indexed by ID of the desired objects -- we do this because some
    #items may appear more than once so we'll need to get at them multiple
    #times.
    my %s_index;
    if ($orig_selection) {
        my %meta_by_id = map { ( $_ => 1 ) } map { @$_ } values %lookup;
        foreach my $item (@$orig_selection) {
            last if !keys %meta_by_id;             #found all, get out quick
            next if !$meta_by_id{ $item->{id} };
            $s_index{ $item->{id} } = $item;
            delete $meta_by_id{ $item->{id} };
        }
    }

    #collect the IDs of the desired objects into selection arrays;
    #sorted according to the order of the related objects specified
    #in the rparam arg.
    my ( $u, $lim ) = ( $rjoin->{unique}, $rjoin->{limit} );
    my $selection;
    if ( $u && $lim ) {                            #unique and limited
        $selection =
          _map_point_u_lim( $obj, \%lookup, \%s_index, $source, $site, $lim );
    }
    elsif ($u) {                                   #unique, unlimited
        $selection =
          _map_point_u( $obj, \%lookup, \%s_index, $source, $site );
    }
    elsif ($lim) {                                 #not unique, limited
        $selection =
          _map_point_lim( $obj, \%lookup, \%s_index, $source, $site, $lim );
    }
    else {                                         #full results
        $selection = _map_point( $obj, \%lookup, \%s_index, $source, $site );
    }

    my $weak_select = $driver->_empty_driver;
    $weak_select->set_selection($selection);
    $weak_select;
}

sub _pointer_source_and_selection {
    my $driver = shift;
    my ( $from_or_to, $source, $rjoin, $site, $rold_id ) = @_;

    #get the search terms to find the pointer objects
    my ( $want, $other ) =
      $from_or_to eq 'points_to'
      ? ( 'source', 'target' )
      : ( 'target', 'source' );

    my %pfind = (
        ( $want . '_table' ) => $source,
        site                 => $site,
    );
    $pfind{ $want . '_id' } = $rold_id if $rold_id;
    $pfind{type} = $rjoin->{relation} if $rjoin->{relation};
    my $join = $rjoin->{'join'} or croak 'join class/selection missing';
    my $jsource;    #get joined class ($join can be a selection or a class)
    if ( ref $join eq __PACKAGE__ ) {
        $jsource = $join->_source_from_selection or return ();
        $pfind{ $other . '_id' } = [map { $_->{id} } @{ $join->selection }];
    }
    else {
        $jsource = $join->data_source;
    }
    $pfind{ $other . '_table' } = $jsource;

    #gather the pointers; can provide a selection of pointers via pselect
    my $pointer = $rjoin->{'pselect'} || BigMed::Pointer->data_source;
    my $pdriver = $driver->_empty_driver;
    my $select  = $pdriver->collect_matches( \%pfind, undef, $pointer )
      or return ();
    ( $jsource, $select );
}

#the following map methods take a selection of related objects from
#_join_points and an ID map and returns a selection array for the
#wanted "primary" objects. These are "weak" selection arrays, though,
#because they contain only id, site and data_source info -- none of the
#object metadata that a selection array normally has.

#This means that the resulting selection object will be able to do
#useful select requests. However, the following methods should work
#just fine: fetch, next, previous, join_has, join_points_to and
#join_pointed_from.

#This is probably good enough; if not, it's possible to do a bit of work
#to make the selections fill out the regular metadata, but it means
#returning to the table index to find the info for all of the IDs.
#For now, it doesn't seem to be worth the performance cost.

sub _map_point_u_lim {    #unique and limited
    my ( $selection, $rlookup, $rs_index, $source, $site, $limit ) = @_;
    my @all_id;
    my %got_it;
  GATHERALL: foreach my $obj (@$selection) {
        foreach my $id ( @{ $rlookup->{ $obj->{id} } } ) {
            next if $got_it{$id};
            push @all_id,
              ( $rs_index->{$id}
                  || { id => $id, data_source => $source, site => $site } );
            $got_it{$id} = 1;
            last GATHERALL if @all_id >= $limit;
        }
    }
    \@all_id;
}

sub _map_point_u {    #unique, unlimited
    my ( $selection, $rlookup, $rs_index, $source, $site ) = @_;
    my @all_id;
    my %got_it;
    foreach my $obj (@$selection) {
        push @all_id, map {
            $rs_index->{$_}
              || { id => $_, data_source => $source, site => $site }
          }
          grep { $got_it{$_} ? 0 : ( $got_it{$_} = 1 ) }
          @{ $rlookup->{ $obj->{id} } };
    }
    \@all_id;
}

sub _map_point_lim {    #limited but not unique
    my ( $selection, $rlookup, $rs_index, $source, $site, $lim ) = @_;
    my @all_id;
  GATHERALL: foreach my $obj (@$selection) {
        foreach my $id ( @{ $rlookup->{ $obj->{id} } } ) {
            push @all_id,
              ( $rs_index->{$id}
                  || { id => $id, data_source => $source, site => $site } );
            last GATHERALL if @all_id >= $lim;
        }
    }
    \@all_id;
}

sub _map_point {    #unlimited not unique
    my ( $selection, $rlookup, $rs_index, $source, $site ) = @_;
    my @all_id =
      map {
        $rs_index->{$_}
          || { id => $_, data_source => $source, site => $site }
      }
      map { @{ $rlookup->{ $_->{id} } } } @$selection;
    \@all_id;
}

sub _source_from_selection {
    my $driver  = shift;
    my $rselect = $driver->selection;
    my $first   = $rselect->[0] or return undef;
    $first->{data_source};
}

sub _empty_driver {
    my $driver = shift;
    ( my $subclass = ref $driver ) =~ s/^.*:://;
    BigMed::Driver->new( $subclass, $driver->bigmed );
}

###########################################################
# PRIVATE :: INDEX SEARCH ROUTINES
###########################################################

sub collect_matches {

    #private, should only be called by this driver subclass
    my $driver      = shift;
    my %terms       = ref $_[0] eq 'HASH' ? %{ $_[0] } : ();
    my %args        = ref $_[1] eq 'HASH' ? %{ $_[1] } : ();
    my $data_source = $_[2] || $driver;

    #get the properties of the source class elements
    #to identify numeric columns when matching against ranges and sorting;
    #if we're looking at multiple sources, use the class of the first
    #source.
    my $source;
    my @sources =
      ref $data_source eq 'ARRAY'
      ? @$data_source
      : ($data_source);
    if ( ref $sources[0] eq __PACKAGE__ ) {
        my $first = $sources[0]->selection->[0] or return [];
        $source = $first->{data_source};
    }
    else {
        $source = $sources[0];

        #and while we're here...
        #don't do a descending sort for mod_time
        #because File driver indices are already
        #stored in that order. This isn't about saving
        #time on the sort, which is fast, but on the
        #ability to save time on the actual search, since
        #a no-sort search lets us do fast searches with the
        #'limit' argument.
        #This can only work, though, if mod_time is the
        #only search column; don't strip out the sort
        #if there are other search columns, because then
        #we could get bad sort results.
        #NOT CURRENTLY DOING THIS FOR searches against previous selections
        #We could though: store info about the sort format in
        #the driver object, and then do some kind of comparison
        #to winnow the selection. The in-memory search is fast
        #enough, though, that we won't worry about it for now.

        #TO TEST:
        #Properly dropping sort methods when we're supposed to,
        #and keeping them when we're not?
        if ( $args{sort} && $args{order} ) {
            my @sort =
              ref $args{sort} eq 'ARRAY'
              ? @{ $args{sort} }
              : ( $args{sort} );
            if ( $sort[0] eq 'mod_time' && !$sort[1] ) {
                my @order =
                  ref $args{order} eq 'ARRAY'
                  ? @{ $args{order} }
                  : ( $args{order} );
                if ( $order[0] eq 'descend' ) {
                    shift @order;
                    shift @sort;
                    $args{sort}  = \@sort;
                    $args{order} = \@order;
                }
            }
        }
    }

    #sort out offset and limit; can limit match times if there's
    #no sort order. Also, need to do it before sort in case we
    #don't have any remaining results; we can save the sort overhead.
    if ( defined $args{offset} ) {
        $args{offset} =~ /\D/
          and croak "'offset' argument must be an integer >= 0";
    }
    else {
        $args{offset} = 0;
    }
    if ( defined $args{limit} ) {
        $args{limit} =~ /^\d+$/ && $args{limit} > 0
          or croak "'limit' argument must be a positive integer";
    }
    else {
        $args{limit} = 0;    #will get set to size of @results later
    }

    #determine max number to match; can only set a max if we
    #aren't sorting and if we have a limit, otherwise set to 0
    my $max =
       !$args{sort} && $args{limit}
      ? $args{offset} + $args{limit}
      : 0;

    #vet the search terms and build the search methods
    my $class = BigMed::Data->source_class($source)
      or croak "Unknown source '$source'";
    my %properties = $class->properties;
    my ( $is_match, $rterms, $rcheck ) =
      $driver->construct_search( \%terms, \%properties, $args{'any'} );
    %terms = %$rterms;
    my %check = %$rcheck;

    #get the matching set
    my @results;
    if ( ref $sources[0] eq __PACKAGE__ ) {

        #do search against previously loaded selection
        my $rselection;
        my $tkey = $sources[0]->{_TUNED};
        if ( $tkey && defined $check{$tkey} && !ref $terms{$tkey} ) {
            my $ritems = $driver->{_TUNED_INDEX}->{ $terms{$tkey} };
            $rselection =
              $ritems ? [@{ $sources[0]->selection }[@{$ritems}]] : [];
        }
        else {
            $rselection = $sources[0]->selection;
        }

        if ( !$max ) {
            @results =
              $is_match
              ? grep { $is_match->($_) } @{$rselection}
              : @{$rselection};
        }
        elsif ($is_match) {    #max and matching routine
            foreach ( @{$rselection} ) {
                push @results, $_ if $is_match->($_);
                last if @results >= $max;
            }
        }
        else {    #max, no matching routine, just lop off existing
            my @current = @{$rselection};
            my $count   = scalar @current;
            my $top     = ( $max <= $count ? $max - 1 : $count - 1 );
            @results = @current[0 .. $top];
        }
    }
    else {        #load the data from disk
        my @site =
          ref $terms{site} eq "HASH"
          ? keys %{ $terms{site} }
          : ( $terms{site} );
        my $remaining = $max;
        foreach my $source (@sources) {
            my $data_class = BigMed::Data->source_class($source)
              or croak "Unknown source '$source'";
            my @site_cycle = $data_class->systemwide ? (0) : (@site);
            my @index_info = $data_class->index_columns or next;
            foreach my $site (@site_cycle) {
                my $dir = $driver->data_directory_path(
                    {   source => $source,
                        site   => $site,
                    }
                ) or return undef;
                my $file = bm_file_path( $dir, "index.cgi" );
                my $rresults = $driver->load_index(
                    file        => $file,
                    source      => $source,
                    is_match    => $is_match,
                    rindex_info => \@index_info,
                    rterms      => \%terms,
                    max         => $remaining
                ) or return undef;
                push( @results, @$rresults );
                if ($max) {
                    last if @results >= $max;
                    $remaining -= @$rresults;
                }
            }
            last if $max && @results >= $max;
        }
    }
    return [] if $args{offset} >= @results;

    #do the sort
    if ( $args{sort} && @results > 0 ) {
        @results = $driver->sort_index( \@results, \%args, \%properties );
    }

    #return the appropriate offset
    $args{limit} = @results unless $args{limit};
    if ( $args{limit} <= @results - $args{offset} ) {
        return [@results[$args{offset} .. $args{offset} + $args{limit} - 1]];
    }
    else {
        return [@results[$args{offset} .. @results - 1]];
    }
}

sub construct_search {
    my ( $driver, $rterms, $rprops, $any ) = @_;

    my $rcheck;
    ( $rterms, $rcheck ) = $driver->build_search_methods( $rterms, $rprops );
    my %terms = %{$rterms};
    return ( undef, $rterms, $rcheck ) if !%terms;    #no search
    my %check = %{$rcheck};

    my $or_search = $any && ( !ref $any || @{$any} > 1 );
    my @and_cols;

    my $WARN = $^W;
    $^W = 0;
    my $and_match = sub {
      COLS: foreach (@and_cols) {
            my $v = $_[0]->{$_};    #2x speed increase by not dereferencing
            if ( !ref $v ) {
                return if !defined $v || !$check{$_}->( $v, $terms{$_} );
            }
            else {
                my $want  = $terms{$_};
                my $check = $check{$_};
                foreach ( @{$v} ) {
                    next COLS if $check->( $_, $want );
                }
                return;             #no match
            }
        }
        return 1;                   #all match
    };
    $^W = $WARN;

    #for AND match, we're done here
    if ( !$or_search ) {
        @and_cols = keys %terms;
        return ( $and_match, $rterms, $rcheck );
    }

    # The rest is only for OR matches
    my @or_cols;
    if ( ref $any ) {               #mix of AND and OR search
        @or_cols = grep { $terms{$_} } @{$any};
        my %or = map { $_ => 1 } @or_cols;
        @and_cols = grep { !$or{$_} } keys %terms;
    }
    else {                          #OR match for all fields (no and_cols)
        @or_cols = keys %terms;
    }

    $^W = 0;
    my $or_match = sub {
      COLS: foreach (@or_cols) {
            my $v = $_[0]->{$_};    #2x speed increase by not dereferencing
            if ( !ref $v ) {
                return 1
                  if defined $v && $check{$_}->( $v, $terms{$_} );    #match
            }
            else {
                my $want  = $terms{$_};
                my $check = $check{$_};
                foreach ( @{$v} ) {
                    return 1 if $check->( $_, $want );                #match
                }
            }
        }
        return;    #no match
    };
    $^W = $WARN;

    if ( !@and_cols ) {    #OR search on all columns
        return ( $or_match, $rterms, $rcheck );
    }
    else {
        my $and_or_match = sub {
            $and_match->(@_) or return;
            return $or_match->(@_);
        };
        return ( $and_or_match, $rterms, $rcheck );
    }
}

sub build_search_methods {

    #private, should be called only by this driver subclass

    #Accepts info about the columns and terms to match, and massages
    #the terms into more search-friendly form and also returns
    #code references to routines to conduct each term/column match

    #Returns:
    #1. Hash reference to a massaged terms hash (keys are columns,
    #   and values have matching info to use in combination with the
    #   search methods delivered in...
    #2. Hash of search methods to use for each column match. Keys
    #   are the column name, and values are the code references.

    #Requires:
    # $_[0]: Driver object
    # $_[1]: Reference to terms hash. Hash should be in format
    #        specified for the terms hash by the select method.
    # $-[2]: Reference to the properties hash for the data type to
    #        search against.

    my $driver     = shift;
    my %terms      = %{ shift @_ };
    my %properties = %{ shift @_ };
    my %check;
    my %final;

    while ( my ( $column, $match_value ) = each(%terms) ) {
        next if !defined $match_value;
        my $rprops = $properties{$column};
        $rprops && $rprops->{index}
          or croak "'$column' is an invalid search column";

        if ( ref $match_value eq 'ARRAY' ) {
            my %term_key;
            foreach ( @{$match_value} ) {
                $term_key{ lc($_) } = 1 if defined $_;
            }
            next if !%term_key;
            $final{$column} = \%term_key;
            $check{$column} = \&is_or_match;
        }
        elsif ( ref $match_value eq 'HASH' ) {
            if (   defined $match_value->{from}
                && defined $match_value->{to} )
            {
                $check{$column} =
                  $driver->is_numeric( $rprops->{type} )
                  ? \&is_nrange_match
                  : \&is_range_match;
            }
            elsif ( defined $match_value->{from} ) {
                $check{$column} =
                  $driver->is_numeric( $rprops->{type} )
                  ? \&is_nge_match
                  : \&is_ge_match;
            }
            elsif ( defined $match_value->{to} ) {
                $check{$column} =
                  $driver->is_numeric( $rprops->{type} )
                  ? \&is_nle_match
                  : \&is_le_match;
            }
            else {
                next;
            }
            $final{$column}->{from} = lc( $match_value->{from} )
              if $match_value->{from};
            $final{$column}->{to} = lc( $match_value->{to} )
              if $match_value->{to};
        }
        elsif ( ref $match_value eq 'Regexp' ) {
            $check{$column} = \&is_regexp_match;
            $final{$column} = $match_value;
        }
        else {
            $check{$column} = \&is_scalar_match;
            $final{$column} = lc($match_value);
        }
    }
    return ( \%final, \%check );
}

sub is_regexp_match { $_[0] =~ $_[1] }
sub is_scalar_match { lc( $_[0] ) eq $_[1] }

sub is_range_match {
    lc( $_[0] ) ge $_[1]->{from} && lc( $_[0] ) le $_[1]->{to};
}
sub is_ge_match { lc( $_[0] ) ge $_[1]->{from} }
sub is_le_match { lc( $_[0] ) le $_[1]->{to} }

sub is_nrange_match {
    $_[0] >= $_[1]->{from} && $_[0] <= $_[1]->{to};
}
sub is_nge_match { $_[0] >= $_[1]->{from} }
sub is_nle_match { $_[0] <= $_[1]->{to} }
sub is_or_match  { $_[1]->{ lc( $_[0] ) } }

###########################################################
# PRIVATE :: SORTING ROUTINE
###########################################################

sub sort_index {

    #private, should only be called by this driver subclass

    #Accepts a reference to an array of index hashes to sort,
    #a hash reference identifying what/how-to search,
    #and a reference to the property types:

    # Returns:
    # An array of the sorted values

    # Requires:
    # $_[0]: Driver object
    # $_[1]: Reference to array of index hashes to sort.
    # $_[2]: Reference to hash containing...
    #        {sort} key: A single value, or a hash array
    #                    of values, indicating which columns
    #                    to sort.
    #        {order} key: Optional. Either 'ascend' or 'descend'
    #                     ascend is the default.
    # $_[3]: Reference to the %properties hash for the data
    #        type to be sorted.

    my $driver     = shift;
    my @results    = @{ shift @_ } or return ();
    my %args       = %{ shift @_ };
    my %properties = %{ shift @_ };
    my @sort_col =
      ref $args{sort} eq "ARRAY"
      ? @{ $args{sort} }
      : ( $args{sort} );
    my @sort_order =
      ref $args{order} eq "ARRAY"
      ? @{ $args{order} }
      : ( $args{order} );
    my @method;

    my $i = 0;
    foreach my $col (@sort_col) {

        #build sort routines, using closures, based on the ascend/
        #descend value plus the numeric/non-numeric value of the
        #column to sort.
        unless ( $properties{$col} && $properties{$col}->{type} ) {
            $i++;
            next;
        }
        $sort_order[$i] = "ascend" unless $sort_order[$i];
        my $j       = $i;
        my $numeric = $driver->is_numeric( $properties{$col}->{type} );
        if ( $sort_order[$i] eq "descend" && $numeric ) {
            push( @method, sub { $_[1]->[$j + 1] <=> $_[0]->[$j + 1] } );
        }
        elsif ( $sort_order[$i] eq "descend" ) {
            push( @method, sub { $_[1]->[$j + 1] cmp $_[0]->[$j + 1] } );
        }
        elsif ( $sort_order[$i] eq "ascend" && $numeric ) {
            push( @method, sub { $_[0]->[$j + 1] <=> $_[1]->[$j + 1] } );
        }
        else {    #ascend
            push( @method, sub { $_[0]->[$j + 1] cmp $_[1]->[$j + 1] } );
        }
        $i++;
    }

    return @results if @method == 0;

    #do the sort using schwartzian transform method
    return map $_->[0], sort {
        local $^W = 0;    #turn off warnings for noisy undef issues
        my $value = 0;
        foreach (@method) {
            $value = $_->( $a, $b ) or next;
            last;
        }
        $value;
      }
      map {
        my $this     = $_;
        my @sort_map = ($this);
        foreach my $col (@sort_col) {
            push( @sort_map,
                ref( $this->{$col} ) eq "ARRAY"
                ? lc( $this->{$col}->[0] )
                : lc( $this->{$col} ) );
        }
        [@sort_map];
      } @results;
}

###########################################################
# PRIVATE :: INDEX UPDATING/SAVING and UNPACKING
###########################################################

sub add_to_index {    #private, should only be called by this driver subclass
    my ( $driver, $data_dir, $index_line, $data_obj, $reindex ) = @_;
    my $chmod = bm_datafile_chmod();
    my $file = bm_untaint_filepath( bm_file_path( $data_dir, "index.cgi" ) )
      or return;
    my $not_unique =
      _build_not_unique_check( $data_obj, $index_line );    #code ref

    my $backup = $file . '_bak';

    #perl 5.8+ handles utf8 on its own via the PerlIO encoding;
    #But perl 5.6 needs to convert on the fly; to make sure we save the
    #same bytes, convert index_line to raw bytes to match lines in
    #existing files.
    if ( !$PERL_5_8 ) {
        $index_line = pack "C*", unpack "U0C*", $index_line;
    }

    #create a lock file to avoid dupes
    my $LOCKFILE;
    my $lockpath = bm_untaint_filepath( bm_file_path( $data_dir, 'lock' ) )
      or return;
    sysopen( $LOCKFILE, $lockpath, O_WRONLY | O_TRUNC | O_CREAT, 0777 )
      or return $driver->set_io_error( $LOCKFILE, 'open', $lockpath, $! );
    flock( $LOCKFILE, LOCK_EX )
      or return $driver->set_io_error( $LOCKFILE, 'lock_ex', $lockpath, $! );
    print $LOCKFILE 'lock'
      or return $driver->set_io_error( $LOCKFILE, 'write', $lockpath, $! );

    #open the main index file
    my $DATAINDEX;
    sysopen( $DATAINDEX, $file, O_WRONLY | O_CREAT, $chmod )
      or return $driver->set_io_error( $DATAINDEX, 'open', $file, $! );

    #copy to backup before flocking to avoid trouble in some windows servers
    bm_copy_file( $file, $backup )
      or return $driver->set_io_error( undef, 'copy', $file, $! );

    #lock and set binmode on index file
    flock( $DATAINDEX, LOCK_EX )
      or return $driver->set_io_error( $DATAINDEX, 'lock_ex', $file, $! );
    binmode( $DATAINDEX, ":utf8" ) if $PERL_5_8;

    #use backup to read original data
    my $ORIG;
    sysopen( $ORIG, $backup, O_RDONLY )
      or return $driver->set_io_error( $ORIG, 'open', $backup, $! );
    flock( $ORIG, LOCK_SH )
      or return $driver->set_io_error( $ORIG, 'lock_sh', $backup, $! );
    binmode( $ORIG, ":utf8" ) if $PERL_5_8;

    #truncate the main index file, start fresh
    truncate( $DATAINDEX, 0 )
      or return $driver->set_io_error( $DATAINDEX, 'truncate', $file, $! );

    #if reindex is true, that means that we cannot assume that the object
    #is the most recent; cannot just add as first line. benchmarking
    #shows no significant cost to these reindex checks for regular saves.
    my $mod_time;
    if ($reindex) {
        $mod_time = $data_obj->mod_time;
    }
    else {
        print $DATAINDEX $index_line, "\n"
          or return $driver->set_io_error( $DATAINDEX, 'write', $file, $! );
    }

    #rewrite all lines, checking for uniqueness along the way
    my $id_check  = $data_obj->id . q{_};
    my $id_length = length $id_check;
  OLDWORDS: while ( my $line = <$ORIG> ) {
        next if substr( $line, 0, $id_length ) eq $id_check;    #skip self
        if ( $not_unique->($line) ) {                           #restore orig
            my $column = $not_unique->($line);    #not-unique column
            return $driver->_not_unique_restore( $data_obj, $column, $backup,
                $file );
        }
        if ( $reindex && ( $line =~ /\A\d+_!!!_([^_]+)/ )[0] le $mod_time ) {
            print $DATAINDEX $index_line, "\n"
              or return $driver->_write_error_restore( $backup, $file );
            $reindex = 0;
        }
        print $DATAINDEX $line                    #already includes line feed
          or return $driver->_write_error_restore( $backup, $file );
    }

    #no more original entries, append any additional lines
    close($ORIG);
    if ($reindex) {    #never added the index line back in
        print $DATAINDEX $index_line, "\n"
          or return $driver->_write_error_restore( $backup, $file );
    }

    #all done; delete backup and close index and lockfile
    bm_delete_file($backup)
      or return $driver->set_io_error( undef, 'unlink', $backup, $! );
    close($DATAINDEX)
      or return $driver->set_io_error( undef, 'close', $file, $! );
    close($LOCKFILE)
      or return $driver->set_io_error( undef, 'close', $lockpath, $! );
    return 1;
}

sub _write_error_restore {    #problem writing to main index
    my ( $driver, $backup, $main_index ) = @_;
    $driver->set_io_error( undef, 'write', $main_index, $! );
    bm_move_file( $backup, $main_index ) if -e $backup;
    return;
}

sub _not_unique_restore {     #problem writing to main index
    my ( $driver, $data_obj, $column, $backup, $main_index ) = @_;
    $driver->set_error(
        head => 'DRIVER_ERR_HEAD_Data Record Contains a Duplicate Value',
        text => [
            'DRIVER_ERR_TEXT_Data Record Contains a Duplicate Value',
            $driver->bigmed->language( $data_obj->data_label ),
            $data_obj->id,
            $column
        ],
    );
    bm_move_file( $backup, $main_index ) if -e $backup;
    return;
}

sub _build_not_unique_check {

    #returns a not_unique code reference that will check an array reference of
    #index-line values against the index line of the object to be saved.
    #returns a true value (specifically, the name of the not-unique column)
    #if a column that must be unique is not unique.
    #returns 0 if no matches found on unique columns.

    my ( $data_obj, $index_line ) = @_;
    my @obj_values = map { $_ ? lc $_ : $_ } split /_!!!_/, $index_line;
    my @unique_cols = $data_obj->unique_indices;    #indices of columns
                                                    #in index-line arrays
    my ( @test_index, @test_value );
    foreach my $col_index (@unique_cols) {
        next
          if !defined $obj_values[$col_index]
              || $obj_values[$col_index] eq '';     #don't test if no value

        push @test_index, $col_index;
    }
    return sub { 0 }
      if !@test_index;

    my @index_col = map { $_->{name} } $data_obj->index_columns;
    return sub {
        my $line = shift;
        chomp $line;
        my @thisline = split( /_!!!_/, $line );
        foreach my $i (@test_index) {
            return $index_col[$i]
              if defined $thisline[$i]
                  && lc( $thisline[$i] ) eq $obj_values[$i];
        }
        return 0;
    };
}

sub load_index {
    my $driver = shift;
    my %args   = @_;
    my ( $file, $source, $is_match, $rindex_info, $rterms, $max ) =
      @args{qw(file source is_match rindex_info rterms max)};
    return [] if !-e $file;

    my @results;
    my $DATAINDEX;
    sysopen( $DATAINDEX, $file, O_RDONLY )
      or return $driver->set_io_error( $DATAINDEX, "open", $file, $! );
    flock( $DATAINDEX, LOCK_SH )
      or return $driver->set_io_error( $DATAINDEX, "lock_sh", $file, $! );
    binmode( $DATAINDEX, ":utf8" ) if $PERL_5_8;
    while ( my $line = <$DATAINDEX> ) {
        $line = pack "U0C*", unpack "C*", $line if !$PERL_5_8;
        my $rhash =
          $driver->unpack_index_line( $line, $source, $rindex_info, $is_match,
            $rterms )
          or next;
        push( @results, $rhash );
        last if $max && @results >= $max;
    }
    close($DATAINDEX)
      or return $driver->set_io_error( undef, "close", $file, $! );
    \@results;
}

sub unpack_index_line {

    #Unpacks a line from a File-formatted index and puts it into
    #a hash reference appropriate for use in File's matching and sorting
    #routines. (Along the way, adds data_source to the hash ref).
    #
    #Returns: The hash reference, or undef if the line is not a match.
    #
    #Requires:
    # $_[0] driver
    # $_[1] data string: the line from the File-formatted index
    # $_[2] source: The data source of the class for which we're loading data
    # $_[3] class' index column info
    # $_[4] is_match code reference, generated in the collect_matches routine
    #       (optional).
    # $_[5] hash reference to search terms (vetted and buffed in
    #       build_search_methods routine)
    #
    # Note: if $_[4] is_match and $_[5] rterms are not both provided,
    # the routine will declare the line a true match and return the
    # hash reference for the line.
    #
    # Example:
    # $rhash = $driver->unpack_index_line(
    #   $index_line,
    #   DATACLASS->data_source(),
    #   [ DATACLASS->index_columns() ]
    #   \&is_match,
    #   \%search_terms,
    # );

    my ( $driver, $line, $source, $rindex_cols, $is_match, $rterms ) = @_;
    chomp $line;
    my @values = split( /_!!!_/, $line );
    my %index;

    ## to improve the speed, we just load the columns for the match
    ## terms. Cuts time significantly, since the speed bottleneck is
    ## the unpacking routine.

    if ( $is_match && $rterms ) {    #load columns for match terms only
        my $i = 0;
        foreach my $col (@$rindex_cols) {
            my $colname = $col->{name};
            if ( defined $rterms->{$colname} ) {
                $index{$colname} =
                  $col->{must_unpack}
                  ? BigMed::Elements->unpack( $col->{type}, $values[$i],
                    $col )
                  : $values[$i];
            }
            $i++;
        }
        return if !$is_match->( \%index );
    }

    #this is a match, so load all remaining columns and return
    #the index hash for this line.
    my $i = 0;
    foreach my $col (@$rindex_cols) {
        my $colname = $col->{name};
        if ( !exists $index{$colname} ) {
            $index{$colname} =
              $col->{must_unpack}
              ? BigMed::Elements->unpack( $col->{type}, $values[$i], $col )
              : $values[$i];
        }
        $i++;
    }
    $index{data_source} = $source;
    \%index;
}

###########################################################
# PRIVATE :: LOAD OBJECT ROUTINES
###########################################################

sub load_indexed_object {
    my ( $driver, $index ) = @_;
    my $rselect = $driver->selection;
    return '' if $index >= @$rselect || $index < 0;
    my $item = $rselect->[$index];
    return $driver->load_object(
        BigMed::Data->source_class( $item->{data_source} ),
        $item->{id},
        $driver->data_directory_path(
            { source => $item->{data_source}, site => $item->{site} }
        )
    );
}

sub load_object {

    # @_ = [0]driver, [1]class, [2]id, [3]dir
    my %properties = $_[1]->properties;
    my $data       = $_[1]->new();
    ( $_[2] && $_[3] ) or croak 'load_object requires dir and id';
    return q{} if !-e ( my $file = bm_file_path( $_[3], "$_[2].cgi" ) );

    foreach ( bm_load_file($file) ) {
        next unless /\A(.*?)::(.*)/ && $properties{$1};
        my $setter = "set_$1";
        my $val    = $2;         #5.8.0 mangles utf8 if use $1/$2 directly
        $data->$setter(
            $properties{$1}->{must_unpack}
            ? BigMed::Elements->unpack( $properties{$1}->{type},
                $val, $properties{$1} )
            : $val
        );
    }
    $data->mark_unmodified;
    return $data;
}

###########################################################
# PRIVATE :: SUPPORT ROUTINES FOR TRASH METHODS
###########################################################

sub delete_objects {

    #accepts an array of array references (data_source, id, site) and:
    #1) deletes the data records associated with each array reference.
    #2) removes index entries for each data record.

    my $driver = shift;
    my @args   = @_;

    #vet the data objects to delete first
    #(don't want to die in the middle of deleting, or the index
    #could get out of whack).
    #Along the way, collect the data dir paths and create a flag
    #hash to mark ids to delete so we can update the index
    #in one pass no matter how many objects we're deleting.
    #All of this is handled in flag_dirs_objects_to_delete
    #routine.

    my ( $r_data_dir, $r_to_delete ) =
      $driver->flag_dirs_objects_to_delete(@args);

    my %data_dir  = %$r_data_dir;
    my %to_delete = %$r_to_delete;

    #delete the data record files and collect any failures.
    #Again, don't want to die in the middle, or the index could
    #get out of whack.

    my @failed;
    foreach my $item (@args) {
        my ( $source, $id, $site ) = @$item;
        my $dir = $site ? $data_dir{$source}->{$site} : $data_dir{$source};
        my $file = bm_file_path( $dir, "$id.cgi" );
        if ( -e $file ) {
            unless ( bm_delete_file($file) ) {

                #collect the failure and unflag for deletion from index
                push( @failed, [$source, $id, $site, $!] );
                if ($site) {
                    delete $to_delete{$source}->{$site}->{$id};
                }
                else {
                    delete $to_delete{$source}->{$id};
                }
            }
        }
    }

    #remove items from indices; remove_from_indices does not
    #quit on error. Just stows error messages into the error
    #array, so we can collect and act on them later.

    $driver->remove_from_indices( \%data_dir, \%to_delete );
    unless (@failed) {
        return $driver->error ? undef : 1;
    }

    #otherwise, collect failed deletion messages into error message
    my @lang_failure_list;
    foreach my $failure (@failed) {
        my ( $source, $id, $site, $error ) = @$failure[0 .. 3];
        my $class = BigMed::Data->source_class($source);
        my $label = $class->data_label || '';
        push @lang_failure_list,
          ['DRIVER_ERR_Data record could not be removed', $id, $label,
            $error];
    }
    my $list_text = $driver->bigmed->language_list(@lang_failure_list);
    return $driver->set_error(
        head => 'DRIVER_ERR_One or More Records Could Not Be Deleted',
        text => [
            'DRIVER_ERR_TEXT_We ran into trouble deleting the following file(s)',
            $list_text
        ]
    );
}

sub flag_dirs_objects_to_delete {

    #This is a utility routine that supports the delete_objects
    #routine.

    #Steps through the arguments supplied to delete_objects,
    #vets them for form/validity, collects the data directories
    #into a hash, and flags data ids to be deleted in a hash.

    #Returns references to the data_dir hash and the to_delete hash.

    my $driver = shift;
    my %data_dir;
    my %to_delete;
    foreach my $item (@_) {
        ref $item eq "ARRAY"
          or croak "delete_objects argument not an array reference";
        my ( $source, $id, $site ) = @{$item}[0 .. 2];

        #Lookup the data directory for this object if we don't
        #already have it and put it into the data_dir hash.
        #Flag the object for deletion.

        #Need to use two different approaches for site-specific
        #versus systemwide data.

        if ($site) {
            unless ( $data_dir{$source} && $data_dir{$source}->{$site} ) {

                #don't have the directory yet; look it up.
                $data_dir{$source}->{$site} = $driver->data_directory_path(
                    { source => $source, site => $site } );
            }
            $to_delete{$source}->{$site}->{$id} = 1;    #flag for delete
        }
        else {    #must be a system-wide data type
            my $class = BigMed::Data->source_class($source)
              or croak "delete_objects: unknown data source '$source'";
            $class->systemwide
              or croak "delete_objects requires site id for "
              . "site-specific classes";
            unless ( $data_dir{$source} ) {
                $data_dir{$source} =
                  $driver->data_directory_path( { source => $source } );
            }
            $to_delete{$source}->{$id} = 1;    #flag for delete
        }
    }
    ( \%data_dir, \%to_delete );
}

sub remove_from_indices {

    #utility routine that supports the delete_objects routine.
    #removes items from indices; if any of the i/o operation fail,
    #don't leave the routine yet; complete the run, but set error
    #messages

    #the data directories are assumed to exist, and no effort
    #is made to vet or create the directories; safe assumption,
    #since that's handled by the flag_dirs_objects_to_delete
    #routine which gathers the data_dir hash.

    #arguments:
    #[0] driver
    #[1] ref to hash of data directories
    #[2] ref to hash of delete_flags

    my $driver    = $_[0];
    my %data_dir  = %{ $_[1] };
    my %to_delete = %{ $_[2] };
    my $chmod     = bm_datafile_chmod();

    foreach my $source ( keys %to_delete ) {

        #need to find out if this is systemwide or not.
        my $class = BigMed::Data->source_class($source);
        my @sites = $class->systemwide
          ? (0)    #forces to go through site loop but doesn't trip the
                   #site-specific directory
          : keys %{ $to_delete{$source} };
        foreach my $site (@sites) {

            #get the data directory for either site-specific or systemwide
            my $dir =
              $site ? $data_dir{$source}->{$site} : $data_dir{$source};

            #get the hash reference of ids to delete for this source/site
            my $delete =
              $site ? $to_delete{$source}->{$site} : $to_delete{$source};
            next unless keys %$delete;    #probably failed in record delete

            #update the index file; don't 'return'
            #on set_io_error, just add the info to the error string.

            #open, read and reset the index file
            my $file =
              bm_untaint_filepath( bm_file_path( $dir, "index.cgi" ) )
              or next;
            my $DATAINDEX;
            sysopen( $DATAINDEX, $file, O_RDWR | O_CREAT, $chmod )
              or $driver->set_io_error( $DATAINDEX, "open", $file, $! )
              or next;
            flock( $DATAINDEX, LOCK_EX )
              or $driver->set_io_error( $DATAINDEX, "lock_ex", $file, $! )
              or next;

            #for perl 5.8+, always use this binmode as a matter of course;
            #for perl 5.6, do nothing; should work fine as raw bytes
            binmode( $DATAINDEX, ":utf8" ) if $PERL_5_8;

            my @dataindex = <$DATAINDEX>;
            seek( $DATAINDEX, 0, 0 )
              or $driver->set_io_error( $DATAINDEX, "rewind", $file, $! )
              or next;
            truncate( $DATAINDEX, 0 )
              or $driver->set_io_error( $DATAINDEX, "truncate", $file, $! )
              or next;

            #update and rewrite the index file, skipping entries to be
            #deleted
            foreach my $line (@dataindex) {
                my $id = ( split( /_!!!_/, $line ) )[0];
                next
                  if ( $site && $to_delete{$source}->{$site}->{$id} )
                  || ( !$site && $to_delete{$source}->{$id} );
                print $DATAINDEX $line
                  or $driver->set_io_error( $DATAINDEX, "write", $file, $! )
                  or last;
            }
            close($DATAINDEX)
              or $driver->set_io_error( $DATAINDEX, "close", $file, $! )
              or next;
        }
    }
}

###########################################################
# PRIVATE :: MISC ROUTINES
###########################################################

sub counter_directory_path {

    #private, should only be called by this driver subclass

    my $driver = shift;
    my $bigmed = $driver->bigmed;
    my $dir    = bm_file_path( $bigmed->env('MOXIEDATA'), "counters" );
    bm_confirm_dir( $dir, { build_path => 1, data => 1 } )
      or return undef;
    $dir;
}

1;
__END__

