#!/usr/bin/perl -w
# $Id:$
#
# NAME
# 		crossval - Run Aleph on a collection of cross validation sets.
#
# SYNOPSIS
#		crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]
#				 EXP_BASE BACKGROUND DATA_PATTERN
#				 [SETTINGS_FILE] [COMMAND_FILE]
# SETUP
#		These are the requirements (or at least what I have used to get this
#		script to work):
#			Perl V5.6.0 or later and the Getopt::Long and File::Basename modules
#			These modules are pretty standard so you probably don't have to worry
#			about them if you have Perl installed.
#
#			Aleph Version 3 with Stasinos Konstantopoulos's [konstant@let.rug.nl]
#			cross-validation patch and my write_rules patch.
#
#			Yap 4.3.19 or higher (requires the yap -L option)
#
# USAGE
#		Runs a series of Aleph sessions, with input and output stored in files
#		beginning with EXP_BASE. The background file for the task is given by
#		BACKGROUND (without the '.b' extension) and the examples are contained in
#		any file matching the regular expression in DATA_PATTERN and ending with
#		'.f' or '.n'. For example,
#			crossval experiments/exp1/test ../data/mutagenesis ../data/folds/s.*
#		Will run Aleph using '../data/mutagenesis.b' as the background file,
#		and the cross-validation set in '../data/folds/' that matches the pattern 
#		's.*\.[fn]'.
#
#		That is, if the '../data/folds/' directory contains these files:
#			split1.f, split1.n, split2.f, split2.n, ... , split10.f, split10.n
#		Then there will be 10 Aleph sessions run, each with a different splitN.f
#		and splitN.n file held out. The results of each run will be found in the 
#		'experiments/exp1/' directory named
#			'test1.out', ..., 'test10.out'.
#		Any of these experiments can be rerun later by executing the files 
#			'test1.in', ..., 'test10.in'.
#			
#		The Aleph sessions are run by building a Yap script that looks like this:
#			#!/wherever/you/put/yap -L
#			... some initialization commands ...
#
#			:- consult('the aleph.pl file').
#			:- read_all('background', ['example base 1', ..., 'example base k']).
#			
#			% SETTINGS
#			:- set(rulefile, 'exp. base.rules').
#			... settings from SETTINGS_FILE if given ...
#			... settings from --set options ...
#			... settings from --param options ...
#
#			% COMMANDS
#			... commands from the COMMAND_FILE or the commands below ...
#			:- induce.
#			:- write_rules.
#
# OPTIONS
#		-a, --aleph ALEPH
#			ALEPH is the location of the file containing Aleph.
#			This overwrites the value stored in the ALEPH environment variable.
#		-y , --yap YAP
#			YAP is the location of the Yap executable.
#			This overwrites the value stored in the YAP environment variable.
#		--set ALEPH_SETTING
#			ALEPH_SETTING is of the form setting=value. This is turned into
#			a "set(setting,value)" command which is run after Aleph is loaded.
#			Several settings can be made by repeating this option, eg:
#				--set nodes=500 --set clauselength=3
#			These will appear after (and hence overwrite) any settings in loaded
#			background files or the SETTINGS_FILE.
#		--param ALEPH_SETTING
#			Exactly the same as --set except that the parameter is added to the
#			run's filename so it can be processed later with makeplot.
#			These overwrite any settings made by --set.
#
# ENVIRONMENT VARIABLES
#		ALEPH
#			If set, this variable's value is used as the location of the 
#			'aleph.pl' file.
#		YAP
#			If set, this variable's value is used as the location of the 
#			Yap executable.
#			
#		CYGWIN_ROOT
#			If you are using Cygwin, you must set this variable to the base
#			of your Cygwin installation (eg, mine is C:/cygwin). This is
#			required as Yap for Cygwin does not use the Cygwin .dll to resolve
#			filenames.
#
#			If you are running on Unix or Linux make sure this is not set.
#
#	Note: The location of the 'aleph.pl' file and the Yap executable must either
#	be set through these variables or given through the command line options above.
#			
# AUTHOR
# 	Mark Reid
#	mreid@cse.unsw.edu.au
#
# CHANGELOG
#	29th Sept 2002
#		Fixed empty :- lines added when using command and settings files.
#		Single --param was not placed into filename.
#   2nd Oct 2002
#		Added --debug option.
use strict;
use Getopt::Long;
use File::Basename;

# Usage string, printed whenever the command line cannot be understood.
my $SYNOPSIS 
  = "crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]"
   ."\n         EXP_BASE BACKGROUND DATA"
   ."\n         [SETTINGS_FILE] [COMMAND_FILE]";

# Yap under Cygwin does not understand absolute Cygwin paths, so a prefix
# is grabbed from the environment to say how to convert /some/path to
# C:/cygwin/some/path.
# The documented variable name is CYGWIN_ROOT; CYGROOT is the name this
# script used to read, so it is still honoured as a fallback.
my $CYGWIN_ROOT = defined $ENV{'CYGWIN_ROOT'} ? $ENV{'CYGWIN_ROOT'}
                : defined $ENV{'CYGROOT'}     ? $ENV{'CYGROOT'}
                :                               "";

# Aleph settings given with --set (not recorded in the run's filename) and
# with --param (recorded in the run's filename).
my %settings;
my %params;
# When true, each run is handed to the 'enqueue' command instead of being
# executed directly.
my $PARALLEL = 0;
# Locations of Aleph and Yap default to the environment and may be
# overridden on the command line.
my $ALEPH = (defined $ENV{'ALEPH'} ? $ENV{'ALEPH'} : "");
my $YAP = (defined $ENV{'YAP'} ? $ENV{'YAP'} : "");
my $DEBUG = 0;

# GetOptions returns false when it meets an option it cannot parse; stop
# there rather than starting an experiment with a half-understood command line.
GetOptions
  (
   'param:s'	=> \%params,
   'set:s'		=> \%settings,
   'parallel'	=> \$PARALLEL,
   'aleph:s'	=> \$ALEPH,
   'yap:s'		=> \$YAP,
   'debug'		=> \$DEBUG
  ) || die "USAGE: $SYNOPSIS\n";

# We must know where Aleph and Yap are
($ALEPH && $YAP) || die "Cannot run crossval without ALEPH and YAP!\n";

my ($experiment, $background, $data, $settings, $commands) = @ARGV;

# EXP_BASE, BACKGROUND and DATA_PATTERN are required. Test for presence
# rather than truth so that a value such as "0" is accepted.
foreach my $required ($experiment, $background, $data) {
  if(! defined($required) || $required eq '') {
	die "USAGE: $SYNOPSIS\n";
  }
}

# Print header
print "--------------------------------------\n";
print "Experiment started at: ".localtime()."\n";
print "Yap: $YAP\nAleph: $ALEPH\n";

# Build a list of data basenames from the pattern plus .f or .n extensions.
# The pattern is anchored at the end (\z) so that only names really ending in
# '.f' or '.n' are picked up; editor backups such as 'split1.f~' are ignored.
$DEBUG && print "DEBUG -- File Base: '$data'.\n";
my @datafiles = @{getFiles($data.'\.[fn]\z')};
$DEBUG && print "DEBUG -- Files:\n".join("\n\t",@datafiles)."\n";

# Each fold is identified by the name of its '.f' file with the extension
# removed. Files are visited in sorted order and every base is recorded once,
# so the list of folds is the same from one invocation to the next.
my @databases;
my %seen_base;
foreach my $file (sort @datafiles) {
  my ($base) = $file =~ /\A(.*)\.f\z/s;
  next unless defined($base);
  next if $seen_base{$base}++;

  # If the file base has an absolute position, prepend it with the cygwin root.
  if($base =~ /^\//) { $base = $CYGWIN_ROOT.$base; }
  push @databases, $base;
}

# Without any folds there is nothing to run; say so instead of reporting a
# "completed" experiment that never started a session.
@databases
  || die "No example files matching '$data' with a '.f' extension were found.\n";

my @exp_params;		# Parameters are settings that are recorded in the filename
					# (currently unused: parameters are pushed onto @exp_settings).
my @exp_settings;	# Settings are not recorded in the filename.
my @exp_commands;	# These are the commands to be run after the settings and params.

# Add settings from settings file. These will override any settings in the
# $background file. Lines holding nothing but whitespace are skipped so that
# they do not turn into empty ':- ' directives in the generated script.
if($settings) {
  open(my $settings_fh, '<', $settings)
	|| die "Could not read settings from '$settings': $!\n";
  print "Settings File: '$settings'.\n";
  push @exp_settings, "% Settings: $settings";
  while(my $line = <$settings_fh>) {
	$line =~ s/\s+\z//;		# drops the newline (and any CR or trailing blanks)
	push @exp_settings, $line if $line =~ /\S/;
  }
  close($settings_fh);
}

# This is the same as below but the settings are not saved in the filename.
# Keys are sorted so the generated script is identical from run to run.
foreach my $param (sort keys %settings) {
	my $paramval = $settings{$param};

	print "Using setting '$param' with value '$paramval'\n";
	push @exp_settings, "set($param,$paramval).";
}

# Add parameters given on the command line. These override settings given in the
# $background file and in the $settings file. Parameters are just settings that
# are stored in the file name for processing by makeplot.
# Keys are sorted because their order becomes part of the output filename:
# the same options must always produce the same name.
my @baseexts = ();
foreach my $param (sort keys %params) {
	my $paramval = $params{$param};

	print "Using parameter '$param' with value '$paramval'\n";
	push @exp_settings, "set($param,$paramval).";
	push @baseexts, $param."_".$paramval;
}

# Add any parameters to the experiments base name.
my $expbase = (@baseexts ? $experiment."--".join("-", @baseexts)."--" : $experiment);

# If a commands file was specified, add it to the list of commands, otherwise
# just add "induce." and "write_rules.". Always show the settings at the start
# of the run once all the settings have been made.
push @exp_commands, "show(settings).";
if($commands) {
  open(my $commands_fh, '<', $commands)
	|| die "Could not open commands file: '$commands': $!\n";
  print "Commands File: '$commands'.\n";
  push @exp_commands, "% From commands file: $commands";
  while(my $line = <$commands_fh>) {
	$line =~ s/\s+\z//;
	push @exp_commands, $line if $line =~ /\S/;
  }
  close($commands_fh);
} else {
  push @exp_commands, "induce.";
  push @exp_commands, "write_rules.";
}


# Run all of the cross validations
print basename($expbase).":\n";
my %runs = %{buildruns(\@databases)};

# Directory handed to Yap for add_to_path/cd. It is the same for every run,
# so work it out once. Fall back to asking the system when the shell did not
# export PWD.
my $cwd = $ENV{'PWD'};
if(! defined($cwd) || $cwd eq '') {
  require Cwd;
  $cwd = Cwd::getcwd();
}
my $pwd = $CYGWIN_ROOT.$cwd;

# Held-out folds are visited in sorted order so that a given run number
# always refers to the same fold, whatever order the hash returns its keys in.
my $runnum = 0;
foreach my $holdout (sort keys %runs) {
  $runnum++;
  my $runname = $expbase.$runnum;
  my $outfile = $runname.".out";
  my $rulesfile = $runname.".rules";
  my @run = @{$runs{$holdout}};

  # These need to be run to ensure the enviroment is okay and all the files
  # are loaded in.
  my @run_settings =
	(
	 "path(_P).",				# This is here to get around weird path bug
	 "add_to_path('$pwd').",	# So we can consult $background
	 "consult('$ALEPH').",
	 "cd('$pwd').",				# So we can read in the example files
	 "read_all('$background', [".join(',', map("'$_'", @run))."]).",
	 "set(record, true).",
	 "set(recordfile, '$outfile').",
	 "set(test_pos, '$holdout.f').",
	 "set(test_neg, '$holdout.n').",
	 "set(experiment, '$runname').",
	 "set(rulefile, '$rulesfile')."
	);

  # Write the commands to an .in file in this order:
  # 	run_settings, exp_settings, exp_commands
  my $infilename = $runname.".in";
  open(my $script_fh, '>', $infilename)
	|| die "Could not open '$infilename' for writing!\n";
  print $script_fh "#!$YAP -L \n";
  print $script_fh "# BUILT: ".localtime()."\n";
  my @commands = ();
  foreach my $command (@run_settings, @exp_settings, @exp_commands) {
	# Comment lines (possibly indented) are copied as they are; everything
	# else becomes a directive.
	if ($command =~ /^\s*%/) {
	  push @commands, $command;
	} else {
	  push @commands, ':- '.$command;
	}
  }
  print $script_fh join("\n", @commands)."\n";
  # Buffered write errors only show up at close time.
  close($script_fh) || die "Could not finish writing '$infilename': $!\n";

  # Delete any old .out files that may be lying around so record doesn't append to it.
  if(-f $outfile) {
	unlink($outfile) || warn "Could not remove old '$outfile': $!\n";
  }

  # Make the script executable, as 'chmod +x' would: add the execute bits
  # that the current umask allows to the file's existing permissions.
  my $mode = (stat($infilename))[2];
  defined($mode) || die "Could not stat '$infilename': $!\n";
  chmod(($mode & 07777) | (0111 & ~umask()), $infilename)
	|| die "Could not make '$infilename' executable: $!\n";

  # Run the experiment
  my $time = time();
  print "\tRun ".$runnum."...";

  # A name without a directory part would be looked up in PATH by the shell,
  # so make it explicitly relative. The name is single-quoted so that spaces
  # or shell metacharacters in EXP_BASE cannot split or alter the command.
  my $script = ($infilename =~ m{/} ? $infilename : "./".$infilename);
  (my $quoted = $script) =~ s/'/'\\''/g;
  my $command = ($PARALLEL ? "enqueue " : "")."'".$quoted."'".' > /dev/null 2>&1';
  my $status = system($command);

  print $PARALLEL ? "enqueued." : "completed!";
  $time = time() - $time;
  print " (Time: $time s)\n";

  # The session's output is discarded, so a non-zero status is the only
  # sign that something went wrong; report it without stopping the other runs.
  if($status != 0) {
	warn "Warning: run $runnum ($infilename) returned wait status $status\n";
  }
}

print "Experiment completed at: ".localtime()."\n";

# getFiles
# Lists the plain files in a directory whose names match a pattern.
#
# The single argument is a path whose directory part names the directory to
# read and whose last component is a regular expression. The expression is
# matched from the start of each file name; it is not anchored at the end
# unless the expression itself says so. Directories and other non-plain
# files are left out.
#
# Returns a reference to an array of "directory/name" strings, sorted by
# name so that the result does not depend on the order in which the
# directory happens to be read.
# Dies if the directory cannot be opened or the expression is not valid.
sub getFiles {
  my ($filepattern) = @_;
  my $dir = dirname($filepattern);
  my $file = basename($filepattern);

  # Compile the expression once; report a bad expression in terms of what
  # the user typed.
  my $pattern = eval { qr/$file/ };
  defined($pattern) || die "Invalid file pattern '$file': $@";

  # A lexical handle cannot clash with a directory handle used elsewhere.
  opendir(my $dh, $dir) || die "Could not open directory '$dir': $!\n";
  my @names = grep { /^$pattern/ && -f "$dir/$_" } readdir($dh);
  closedir($dh);

  @names = sort @names;
  my @files = map { $dir."/".$_ } @names;

  return \@files;
}

# buildruns
# Builds the training sets for a cross validation.
#
# Takes a reference to an array of dataset basenames. Returns a reference to
# a hash with one entry per basename: the key is the basename that is held
# out, the value is a reference to an array of all the other basenames, in
# their original order.
sub buildruns {
  my ($dbref) = @_;
  my @dbs = @{$dbref};

  my %runs;

  foreach my $holdout (@dbs) {
	# Training files are all those that are not the holdout file. The names
	# are compared as plain strings: they are file paths, and characters such
	# as '.', '+' or '(' in a path must not be read as regular expression
	# syntax, or the held-out fold could end up in its own training set.
	my @currdbs = grep { $_ ne $holdout } @dbs;
	$runs{$holdout} = \@currdbs;
  }

  return \%runs;
}