#! /usr/bin/env perl
#
# Andrew Janke - a.janke@gmail.com
# http://a.janke.googlepages.com/
#
# Copyright 2006 Andrew Janke
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose and without fee is hereby granted,
# provided that the above copyright notice appear in all copies.
# The author makes no representations about the suitability of this
# software for any purpose.  It is provided "as is" without express
# or implied warranty.

# Turn on autoflush for the currently selected output handle (STDOUT unless
# select() has been called) so progress messages appear as they are printed.
$| = 1;


use strict;
use warnings "all";
use Getopt::Long;
use Pod::Usage;
use File::Basename;
use File::Temp qw/ tempdir /;

# Package identity; hard-wired here until it is generated properly
my $PACKAGE           = basename($0);
my $VERSION           = '1.3.0';
my $PACKAGE_BUGREPORT = '"Andrew Janke" <a.janke@gmail.com>';

# Script-wide state shared by the main flow and the helper subs below
my($Help, $Usage, $me, @opt_table, %opt, @infiles, $avgfile);

# name of this script, used as a prefix for messages and temp directories
$me = basename($0);

# Option defaults; GetOptions overrides these from the command line
%opt = (
   verbose               => 0,
   clobber               => 0,
   fake                  => 0,
   tmpdir                => undef,
   batch                 => 0,
   batch_lastjobid       => undef,
   avgnum                => 20,
   sdfile                => undef,
   filelist              => undef,
   robust                => 0,
   robust_cutoff         => [1.0, 1.2],
   output_type           => '-short',
   max_buffer_size_in_kb => 1024 * 1024,
   );

# Parse the command line into %opt.
#
# Every option is stored under the same key that the defaults in %opt use,
# so later code sees either the default or the user-supplied value.
GetOptions(
   'help|?' => \$opt{'help'},
   'man' => \$opt{'man'},
   'v|verbose' => \$opt{'verbose'},
   'c|clobber' => \$opt{'clobber'},
   'version' => sub { print_version_info() },
   'f|fake' => \$opt{'fake'},
   't|tmpdir=s' => \$opt{'tmpdir'},
   'b|batch' => \$opt{'batch'},
   # must use the 'batch_lastjobid' key: that is the one declared in the
   # defaults and tested when the last job ID is written out
   'batch_lastjobid=s' => \$opt{'batch_lastjobid'},

   # output data type, passed straight through to minccalc
   'byte' => sub { $opt{'output_type'} = '-byte' },
   'short' => sub { $opt{'output_type'} = '-short' },
   'long' => sub { $opt{'output_type'} = '-long' },
   'float' => sub { $opt{'output_type'} = '-float' },
   'double' => sub { $opt{'output_type'} = '-double' },

   'sdfile=s' => \$opt{'sdfile'},
   'avgnum=i' => \$opt{'avgnum'},
   'filelist=s' => \$opt{'filelist'},

   'robust' => \$opt{'robust'},
   'robust_cutoff=f{2}' => \@{$opt{'robust_cutoff'}},

   ) or pod2usage('-verbose' => 1) && exit;

# Getopt::Long appends values to an array destination, so user-supplied
# cutoffs end up after the two defaults. Keep only the last pair so that
# indices 0 and 1 hold the values actually requested.
if(scalar(@{$opt{'robust_cutoff'}}) > 2){
   @{$opt{'robust_cutoff'}} = @{$opt{'robust_cutoff'}}[-2, -1];
   }

# handle -man, -help or missing args
pod2usage('-verbose' => 1) if $opt{'help'};
pod2usage('-exitstatus' => 0, '-verbose' => 2) if $opt{'man'};
pod2usage('-verbose' => 0) && exit if ($#ARGV < 0);

# The last command line argument is always the output average
$avgfile = pop(@ARGV);

# setup infiles, either from --filelist (one name per line) or from the
# remaining command line arguments
if(defined $opt{'filelist'}){
   # read the list directly rather than via `cat` in a shell: the name is
   # not subject to shell interpretation and a missing file is reported
   open(my $list_fh, '<', $opt{'filelist'})
      or die "$me: unable to read filelist $opt{'filelist'}: $!\n\n";
   @infiles = ();
   while(my $line = <$list_fh>){
      chomp($line);
      # blank lines would otherwise be passed on as empty file names
      next if $line =~ m/^\s*$/;
      push(@infiles, $line);
      }
   close($list_fh);
   }
else{
   @infiles = @ARGV;
   }

# nothing to average: stop here instead of failing later on a missing sum file
if(!@infiles){
   die "$me: no input files to average\n\n";
   }

# refuse to overwrite existing outputs unless asked to
if(-e $avgfile && !$opt{'clobber'}){
   die "$me: $avgfile exists, -clobber to overwrite\n\n";
   }
if(defined($opt{'sdfile'}) && -e $opt{'sdfile'} && !$opt{'clobber'}){
   die "$me: $opt{'sdfile'} exists, -clobber to overwrite\n\n";
   }

# make tmpdir, a fresh uniquely named directory under the requested
# location if there is one, otherwise under the system temp directory
# NOTE(review): CLEANUP => 1 removes the directory when this script exits;
# with --batch the submitted jobs may still need it at that point - confirm.
if(defined($opt{'tmpdir'})){
   do_cmd('mkdir', '-p', $opt{'tmpdir'});
   $opt{'tmpdir'} = tempdir("$me-XXXXXXXX", TMPDIR => 1, DIR => $opt{'tmpdir'}, CLEANUP => 1);
   }
else{
   $opt{'tmpdir'} = tempdir("$me-XXXXXXXX", TMPDIR => 1, CLEANUP => 1);
   }

# if robust, we need a standard deviation file
if($opt{'robust'} && !defined($opt{'sdfile'})){
   $opt{'sdfile'} = "$opt{'tmpdir'}/sdfile.mnc";
   }

# Build a running sum (s1) and sum of squares (s2) over the input files,
# skipping NaN values, working through the files in groups of "avgnum".
# Each group is reduced by minccalc and then added to the running totals
# s1.mnc and s2.mnc in the temp directory.
my($i, $start, $stop, @avgfiles, $nfiles, @s1_preq, @s2_preq);
@s1_preq = @s2_preq = ();

# a group size of 0 would never advance the loop below and a negative
# one makes no sense, so reject both up front
if($opt{'avgnum'} < 1){
   die basename($0) . ": --avgnum must be at least 1 (got $opt{'avgnum'})\n\n";
   }

for($i=0; $i<=$#infiles; $i+=$opt{'avgnum'}){

   # index range of this group, clipped to the end of the list
   $start = $i;
   $stop = ($i+$opt{'avgnum'} - 1 < $#infiles) ? $i + $opt{'avgnum'} - 1 : $#infiles;

   print STDOUT "Averaging files [$start - $stop]/$#infiles ";

   @avgfiles = @infiles[$start..$stop];

   # partial sums for this group
   do_cmd_batch("CALC$$-$i", 'none',
      'minccalc',
      '-max_buffer_size_in_kb', $opt{'max_buffer_size_in_kb'},
      '-quiet', '-clobber',
      '-double',
      '-expression', "s1 = s2 = 0; for {i in [0:len(A))} { v=A[i]; if (!isnan(v)) { s1 = s1 + v; s2 = s2 + v*v; } };",
      '-outfile', 's1', "$opt{'tmpdir'}/s1-u-$i.mnc",
      '-outfile', 's2', "$opt{'tmpdir'}/s2-u-$i.mnc",
      @avgfiles);

   print STDOUT "[CALC] ";

   # add to current total
   if($i == 0){
      # first group: its partial sums simply become the totals
      do_cmd_batch("MVs1$$-$i","CALC$$-$i",
         'mv', '-f', "$opt{'tmpdir'}/s1-u-$i.mnc", "$opt{'tmpdir'}/s1.mnc");
      do_cmd_batch("MVs2$$-$i","CALC$$-$i",
         'mv', '-f', "$opt{'tmpdir'}/s2-u-$i.mnc", "$opt{'tmpdir'}/s2.mnc");
      }
   else{
      # later groups: add into a scratch file, then move it over the total.
      # Each add depends on this group's CALC job and on all earlier moves.
      do_cmd_batch("ADDs1$$-$i", join(",", ("CALC$$-$i", @s1_preq)),
         'mincmath',
         '-max_buffer_size_in_kb', $opt{'max_buffer_size_in_kb'},
         '-quiet', '-double',
         '-add', "$opt{'tmpdir'}/s1.mnc", "$opt{'tmpdir'}/s1-u-$i.mnc", "$opt{'tmpdir'}/s1-t.mnc");
      do_cmd_batch("ADDs2$$-$i", join(",", ("CALC$$-$i", @s2_preq)),
         'mincmath',
         '-max_buffer_size_in_kb', $opt{'max_buffer_size_in_kb'},
         '-quiet', '-double',
         '-add', "$opt{'tmpdir'}/s2.mnc", "$opt{'tmpdir'}/s2-u-$i.mnc", "$opt{'tmpdir'}/s2-t.mnc");

      do_cmd_batch("MVs1$$-$i","ADDs1$$-$i",
         'mv', '-f', "$opt{'tmpdir'}/s1-t.mnc", "$opt{'tmpdir'}/s1.mnc");
      do_cmd_batch("MVs2$$-$i","ADDs2$$-$i",
         'mv', '-f', "$opt{'tmpdir'}/s2-t.mnc", "$opt{'tmpdir'}/s2.mnc");
      }

   # later steps must wait for these moves to finish
   push(@s1_preq, "MVs1$$-$i");
   push(@s2_preq, "MVs2$$-$i");
   print STDOUT "[MV-ADD]\n";
   }

# setup output filename: for robust averaging the plain average is only an
# intermediate result, so it goes to the temp directory
my $iavg = ($opt{'robust'}) ? "$opt{'tmpdir'}/iavg.mnc" : $avgfile;

# average = running sum / number of files
$nfiles = $#infiles + 1;
do_cmd_batch("IAVG$$", join(",", @s1_preq),
   'minccalc', '-clobber',
   '-max_buffer_size_in_kb', $opt{'max_buffer_size_in_kb'},
   $opt{'output_type'},
   '-expression', "A[0] / $nfiles",
   "$opt{'tmpdir'}/s1.mnc",
   $iavg);

# standard deviation from the running sum (A[0]) and sum of squares (A[1])
if($opt{'sdfile'}){
   # With a single input file the (n - 1) divisor would be zero. Use 1
   # instead: the numerator is then s2 - s1*s1, so the expression gives 0
   # where it would otherwise divide by zero.
   my $nfiles_less_one = ($nfiles > 1) ? $nfiles - 1 : 1;
   do_cmd_batch("ISTD$$", join(",", @s1_preq, @s2_preq),
      'minccalc', '-clobber',
      '-max_buffer_size_in_kb', $opt{'max_buffer_size_in_kb'},
      $opt{'output_type'},
      '-expression', "sqrt((A[1] - A[0]*A[0]/$nfiles) / $nfiles_less_one)",
      "$opt{'tmpdir'}/s1.mnc",
      "$opt{'tmpdir'}/s2.mnc",
      $opt{'sdfile'});
   }

# do robust averaging if required
#
# For every input file a weight image (m) and a weighted copy (t) are
# made from the file's distance to the plain average in units of the
# standard deviation image: weight 1 below the lower cutoff, 0 above the
# upper cutoff and a linear ramp in between. The robust average is then
# sum(t) / sum(m).
if($opt{'robust'}){

   my($lower, $upper) = @{$opt{'robust_cutoff'}}[0, 1];

   my @add_preq = ();
   my @numer_files = ();
   my @denom_files = ();
   my $n = 0;
   foreach my $infile (@infiles){
      print STDOUT "[ROBUST] Doing $infile";

      # Job and temp file names carry the file's index rather than its
      # path: a job name is also used as a file name by do_cmd_batch, so it
      # must not contain '/', and inputs with the same basename in different
      # directories must not overwrite each other's temp files.
      my $job = "RZ$$-$n";
      my $stem = "$opt{'tmpdir'}/$n-" . basename($infile);
      $n++;

      do_cmd_batch($job, "IAVG$$,ISTD$$",
         'minccalc', '-clobber',
         '-max_buffer_size_in_kb', $opt{'max_buffer_size_in_kb'},
         '-expression',
         "z = (abs(A[2]) < 0.001) ? 0 : abs((A[0] - A[1])/A[2]); " .
         "m = (z < $lower) ? 1 : ((z > $upper) ? 0 : ".
            "1 - ((z - $lower)/($upper - $lower))); " .
         "t = A[0] * m",
         $infile, $iavg,  $opt{'sdfile'},
         '-outfile', 'm', "$stem.m.mnc",
         '-outfile', 't', "$stem.t.mnc");

      push(@add_preq, $job);
      push(@numer_files, "$stem.t.mnc");
      push(@denom_files, "$stem.m.mnc");
      }

   # sum of the weighted inputs
   do_cmd_batch("NUMER$$", join(",", @add_preq),
      'mincmath', '-clobber',
      '-double',
      '-max_buffer_size_in_kb', $opt{'max_buffer_size_in_kb'},
      '-add', @numer_files, "$opt{'tmpdir'}/numer.mnc");

   # sum of the weights
   do_cmd_batch("DENOM$$", join(",", @add_preq),
      'mincmath', '-clobber',
      '-double',
      '-max_buffer_size_in_kb', $opt{'max_buffer_size_in_kb'},
      '-add', @denom_files, "$opt{'tmpdir'}/denom.mnc");

   # weighted average, 0 where the total weight is (near) zero
   my $last_jid = do_cmd_batch("AVG$$", "DENOM$$,NUMER$$",
      'minccalc', '-clobber',
      $opt{'output_type'},
      '-max_buffer_size_in_kb', $opt{'max_buffer_size_in_kb'},
      '-expression', '(A[1] < 0.001) ? 0 : A[0]/A[1]',
      "$opt{'tmpdir'}/numer.mnc", "$opt{'tmpdir'}/denom.mnc",
      $avgfile);

   # Record the ID of the final job so a calling script can wait on it.
   # Nothing is written in --fake mode, which is meant to have no effects.
   # Note that this is only done here, in the robust branch.
   if(defined($opt{'batch_lastjobid'}) && !$opt{'fake'}){
      my $jid_out = defined($last_jid) ? $last_jid : 0;
      open(my $jid_fh, '>', $opt{'batch_lastjobid'})
         or die basename($0) . ": unable to write $opt{'batch_lastjobid'}: $!\n\n";
      print $jid_fh "$jid_out\n";
      close($jid_fh)
         or die basename($0) . ": unable to close $opt{'batch_lastjobid'}: $!\n\n";
      }
   }



# Run a command directly (list form, no shell involved).
# params: the command and its arguments
# The command is echoed when --verbose is set and is not run at all when
# --fake is set. Dies with the command line and the reason if the command
# cannot be started, is killed by a signal or exits non-zero.
# returns 1 on success, nothing in --fake mode
sub do_cmd {
   print STDOUT "@_\n" if $opt{'verbose'};
   return if $opt{'fake'};

   my $status = system(@_);
   return 1 if $status == 0;

   # system() returns -1 when the program could not be started at all
   if($status == -1){
      die "do_cmd: failed to execute '@_': $!";
      }
   # the low 7 bits of the wait status hold a terminating signal, if any
   if($status & 127){
      die "do_cmd: '@_' died with signal " . ($status & 127);
      }
   die "do_cmd: '@_' exited with status " . ($status >> 8);
}

# run a command, via the batch system (qbatch) when --batch is set and
# directly otherwise
# 1st param: Job Name
# 2nd param: Depends string, a comma separated list of job names. 'none'
#            is ignored, a name containing '*' is matched as a wildcard
#            against all jobs submitted so far, names of unknown jobs are
#            skipped
# remainder: command
# returns Job ID if in batch mode, 0 otherwise (including --fake mode)
# dies if qbatch cannot be run or reports failure

# job name => job ID for everything submitted so far
my(%jids);
sub do_cmd_batch {
   my($name, $depends, @cmd) = @_;
   my $jid = 0;

   my $logdir = "$opt{'tmpdir'}/log";
   do_cmd('mkdir', '-p', $logdir) if (!-e $logdir);

   print STDOUT "[$name:$depends] - @cmd\n" if $opt{'verbose'};
   return($jid) if $opt{'fake'};

   # not batching: just run it now
   if(!$opt{'batch'}){
      do_cmd(@cmd);
      return($jid);
      }

   print '   [B] ';
   do_cmd('mkdir', '-p', "$logdir/$$");

   # gather dependencies, translating job names to job IDs
   my @ds = ();
   foreach my $dep (split(/,/, $depends)){
      next if $dep eq 'none';

      if($dep =~ m/\*/){
         # only '*' is a wildcard; everything else in the name is matched
         # literally, so escape it before building the pattern
         my $regex = join('.*', map { quotemeta($_) } split(/\*/, $dep, -1));
         foreach my $known (sort(keys(%jids))){
            push(@ds, $jids{$known}) if $known =~ m/$regex/;
            }
         }
      elsif(defined($jids{$dep})){
         # add the JID if it exists
         push(@ds, $jids{$dep});
         }
      }

   # generate and submit the script
   my @args = ('qbatch',
      '--jid',
      '--queue', 'all.q',
      '--script', "$logdir/$$/$name.sh",
      '--logfile', "$logdir/$$/$name.log",
      '--name', $name,
      (($#ds > -1) ? ('--depends', join(',', @ds)) : ()),
      '--',
      @cmd);
   print join(' ', @args) . "\n" if $opt{'verbose'};

   # submit it; open2 fills in the two lexical handles itself
   use IPC::Open2;
   my($qbatch_out, $qbatch_in);
   my $pid = open2($qbatch_out, $qbatch_in, @args);

   # nothing is sent to qbatch, so close its input straight away
   close($qbatch_in);

   # Read all output before reaping the child, otherwise a child that
   # fills the pipe would block forever. The last line is taken as the ID.
   while(my $line = <$qbatch_out>){
      chomp($line);
      $jid = $line;
      }
   close($qbatch_out);

   waitpid($pid, 0);
   if($? != 0){
      die "do_cmd_batch: submission of job $name via qbatch failed " .
          "(wait status $?)";
      }

   print STDOUT "JID: $jid -- $name - $depends - " . join(',', @ds) . "\n";

   $jids{$name} = $jid;

   return($jid);
   }

# Print the package name, version and bug report address to STDOUT,
# then exit the script
sub print_version_info {
   my $banner = "\n$PACKAGE version $VERSION\n";
   $banner .= "Comments to $PACKAGE_BUGREPORT\n\n";

   print STDOUT $banner;
   exit;
   }



# Short help blurb, one "| "-prefixed line per entry
$Help = "| $me will average 1000's of minc files\n" .
        "|\n" .
        "| Problems or comments should be sent to: a.janke\@gmail.com\n";



__END__

=head1 NAME

B<mincbigaverage> - averages 1000's of MINC files in linear time.

=head1 SYNOPSIS

B<mincbigaverage> [options] infile1.mnc [infile2.mnc [..]] avg.mnc

mincbigaverage is designed to discretise the problem of averaging either
a large number of input files or a smaller number of large files
(>1GB each). There is also some code included to perform "robust"
averaging in which only the most common features are kept via down-weighting
outliers beyond a standard deviation.

   $ mincbigaverage --verbose --robust \
       in1.mnc in2.mnc in3.mnc in4.mnc avg.mnc

=head1 DESCRIPTION

B<mincbigaverage> exists to get around issues with the number of possible
open files in HDF/netCDF. In short, if you have more than 100 files open at
once while averaging, things will slow down significantly.

mincbigaverage does this via an iterative approach to averaging files and
is a direct drop-in replacement for mincaverage. That said, not all the
arguments of mincaverage are supported in mincbigaverage, but they should
be (in time).

Problems or comments should be sent to: a.janke@gmail.com

=head1 OPTIONS

=over 4

=item B<-v>, B<--verbose>

Be noisy when doing things

=item B<--version>

Print version number and exit

=item B<-h>, B<--help>

Dump some quick help output

=item B<--man>

Dump a man page

=item B<-f>, B<--fake>

Don't run anything, just echo the commands that would have been run

=item B<-t>, B<--tmpdir>

Define a tmpdir. This should be used with --robust as a vast amount of data
will be produced for a large number of input files

=item B<-b>, B<--batch>

Run using a batch system for parallel processing (qbatch).

=item B<--batch_lastjobid>

Filename to write out the job ID of the last job of a mincbigaverage run.
This is useful if running mincbigaverage within another script and you
need to wait for completion of mincbigaverage.

=item B<--byte>

Output files with byte precision

=item B<--short>

Output files with short precision

=item B<--long>

Output files with long precision

=item B<--float>

Output files with float precision

=item B<--double>

Output files with double precision

=item B<--sdfile>

Place standard deviation image in specified file

=item B<--avgnum>

Number of input files to average at a time (Default: 20)

=item B<--filelist>

Input file with the files to be averaged one per line

=item B<--robust>

Perform robust averaging: features that are outside 1 standard deviation
from the mean are downweighted. Works well for noisy data with artifacts.
See the --tmpdir option if you have a large number of input files.

=item B<--robust_cutoff>

Requires two floating point numbers following the argument. This defines
the cutoff points for robust averaging. The default of --robust_cutoff 1.0 1.2
means that any data more than 1.0 standard deviation from the average will be
linearly downweighted until 1.2 standard deviations from the mean at which point
the weighting will be zero.

=back

=head1 SEE ALSO

mincaverage(1) minccalc(1)

=head1 AUTHOR

Andrew Janke - a.janke@gmail.com

=cut
