#!/usr/bin/perl -w
# $Id:$
#
# NAME
#   crossval - Run Aleph on a collection of cross validation sets.
#
# SYNOPSIS
#   crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]
#            EXP_BASE BACKGROUND DATA_PATTERN
#            [SETTINGS_FILE] [COMMAND_FILE]
# SETUP
#   These are the requirements (or at least what I have used to get this
#   script to work):
#     Perl V5.6.0 or later and the Getopt::Long and File::Basename modules
#       These modules are pretty standard so you probably don't have to worry
#       about them if you have Perl installed.
#
#     Aleph Version 3 with Stasinos Konstantopoulos's [konstant@let.rug.nl]
#       cross-validation patch and my write_rules patch.
#
#     Yap 4.3.19 or higher (requires the yap -L option)
#
# USAGE
#   Runs a series of Aleph sessions, with input and output stored in files
#   beginning with EXP_BASE. The background file for the task is given by
#   BACKGROUND (without the '.b' extension) and the examples are contained in
#   any file matching the regular expression in DATA_PATTERN and ending with
#   '.f' or '.n'. For example,
#     crossval experiments/exp1/test ../data/mutagenesis ../data/folds/s.*
#   will run Aleph using '../data/mutagenesis.b' as the background file,
#   and the cross-validation set in '../data/folds/' that matches the pattern
#   's.*\.[fn]'.
#
#   That is, if the '../data/folds/' directory contains these files:
#     split1.f, split1.n, split2.f, split2.n, ... , split10.f, split10.n
#   then there will be 10 Aleph sessions run, each with a different splitN.f
#   and splitN.n file held out. The results of each run will be found in the
#   'experiments/exp1/' directory named
#     'test1.out', ..., 'test10.out'.
#   Any of these experiments can be rerun later by executing the files
#     'test1.in', ..., 'test10.in'.
#
#   The Aleph sessions are run by building a Yap script that looks like this:
#     #!/wherever/you/put/yap -L
#     ... some initialization commands ...
#
#     :- consult('the aleph.pl file').
#     :- read_all('background', ['example base 1', ..., 'example base k']).
#
#     % SETTINGS
#     :- set(rulefile, 'exp. base.rules').
#     ... settings from SETTINGS_FILE if given ...
#     ... settings from --set options ...
#     ... settings from --param options ...
#
#     % COMMANDS
#     ... commands from the COMMAND_FILE or the commands below ...
#     :- induce.
#     :- write_rules.
#
# OPTIONS
#   -a, --aleph ALEPH
#       ALEPH is the location of the file containing Aleph.
#       This overwrites the value stored in the ALEPH environment variable.
#   -y, --yap YAP
#       YAP is the location of the Yap executable.
#       This overwrites the value stored in the YAP environment variable.
#   --set ALEPH_SETTING
#       ALEPH_SETTING is of the form setting=value. This is turned into
#       a "set(setting,value)" command which is run after Aleph is loaded.
#       Several settings can be made by repeating this option, eg:
#         --set nodes=500 --set clauselength=3
#       These will appear after (and hence overwrite) any settings in loaded
#       background files or the SETTINGS_FILE.
#   --param ALEPH_SETTING
#       Exactly the same as --set except that the parameter is added to the
#       run's filename so it can be processed later with makeplot.
#       These overwrite any settings made by --set.
#
# ENVIRONMENT VARIABLES
#   ALEPH
#       If set, this variable's value is used as the location of the
#       'aleph.pl' file.
#   YAP
#       If set, this variable's value is used as the location of the
#       Yap executable.
#
#   CYGROOT
#       If you are using Cygwin, you must set this variable to the base
#       of your Cygwin installation (eg, mine is C:/cygwin). This is
#       required as Yap for Cygwin does not use the Cygwin .dll to resolve
#       filenames.
#
#       If you are running on Unix or Linux make sure this is not set.
#
# Note: The location of the 'aleph.pl' file and the Yap executable must either
# be set through these variables or given through the command line options above.
#
# AUTHOR
#   Mark Reid
#   mreid@cse.unsw.edu.au
#
# CHANGELOG
#   29th Sept 2002
#     Fixed empty :- lines added when using command and settings files.
#     Single --param was not placed into filename.
#   2nd Oct 2002
#     Added --debug option.

use strict;
use warnings;
use Cwd;
use Getopt::Long;
use File::Basename;

# Flush progress messages ("Run N...") as they are printed, so they are
# visible while a (possibly long) Aleph session is still running.
$| = 1;

# Usage string
my $SYNOPSIS = "crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]"
    ."\n EXP_BASE BACKGROUND DATA"
    ."\n [SETTINGS_FILE] [COMMAND_FILE]";

# Yap under Cygwin does not understand absolute Cygwin paths, so a prefix
# is grabbed from the environment to say how to convert /some/path to
# C:/cygwin/some/path.  CYGROOT is the variable this script has always read;
# CYGWIN_ROOT is accepted as a fallback when CYGROOT is not set.
my $CYGWIN_ROOT = defined $ENV{'CYGROOT'}     ? $ENV{'CYGROOT'}
                : defined $ENV{'CYGWIN_ROOT'} ? $ENV{'CYGWIN_ROOT'}
                :                               "";

# These store all the Aleph settings given on the command line.
my %settings;       # --set   name=value  (not recorded in the file name)
my %params;         # --param name=value  (recorded in the file name)
my $PARALLEL = 0;   # --parallel: hand each run to 'enqueue' instead of running it
my $ALEPH    = (defined $ENV{'ALEPH'} ? $ENV{'ALEPH'} : "");
my $YAP      = (defined $ENV{'YAP'}   ? $ENV{'YAP'}   : "");
my $DEBUG    = 0;

GetOptions(
    'param:s'  => \%params,
    'set:s'    => \%settings,
    'parallel' => \$PARALLEL,
    'aleph:s'  => \$ALEPH,
    'yap:s'    => \$YAP,
    'debug'    => \$DEBUG
) or die "USAGE: $SYNOPSIS\n";

# We must know where Aleph and Yap are
($ALEPH && $YAP) || die "Cannot run crossval without ALEPH and YAP!\n";

my ($experiment, $background, $data, $settings, $commands) = @ARGV;

# EXP_BASE, BACKGROUND and DATA_PATTERN are required
if (! $experiment || ! $background || ! $data) {
    die "USAGE: $SYNOPSIS\n";
}

# Print header
print "--------------------------------------\n";
print "Experiment started at: ".localtime()."\n";
print "Yap: $YAP\nAleph: $ALEPH\n";

# Build a list of data basenames from the pattern plus .f or .n extensions
$DEBUG && print "DEBUG -- File Base: '$data'.\n";
my @datafiles = @{getFiles($data.'\.[fn]')};
$DEBUG && print "DEBUG -- Files:\n".join("\n\t",@datafiles)."\n";

# Each fold is identified by the base name of its '.f' file.  The match is
# anchored at the end so that stray files such as 'split1.f.bak' do not add
# a duplicate 'split1' entry.
my @databases;
foreach my $file (@datafiles) {
    my ($base) = $file =~ /^(.*)\.f\z/;
    next unless defined $base;

    # If the file base has an absolute position, prepend it with the cygwin root.
    if ($base =~ /^\//) {
        $base = $CYGWIN_ROOT.$base;
    }
    push @databases, $base;
}
@databases
    || die "No example files ('.f') matching '$data' were found.\n";

my @exp_settings;   # Settings and parameters, in the order they are applied.
my @exp_commands;   # These are the commands to be run after the settings and params.

# Add settings from settings file. These will override any settings in the
# $background file.
if ($settings) {
    open(my $settings_fh, '<', $settings)
        || die "Could not read settings from '$settings'";
    print "Settings File: '$settings'.\n";
    push @exp_settings, "% Settings: $settings";
    while (my $line = <$settings_fh>) {
        $line =~ s/\r?\n\z//;
        # Blank lines would otherwise become empty ':- ' directives.
        push @exp_settings, $line if $line =~ /\S/;
    }
    close($settings_fh);
}

# This is the same as below but the settings are not saved in the filename.
# Keys are sorted so the generated script is the same on every invocation.
foreach my $param (sort keys %settings) {
    my $paramval = $settings{$param};
    print "Using setting '$param' with value '$paramval'\n";
    push @exp_settings, "set($param,$paramval).";
}

# Add parameters given on the command line. These override settings given in the
# $background file and in the $settings file. Parameters are just settings that
# are stored in the file name for processing by makeplot.  Keys are sorted so
# the same parameters always produce the same file name.
my @baseexts = ();
foreach my $param (sort keys %params) {
    my $paramval = $params{$param};
    print "Using parameter '$param' with value '$paramval'\n";
    push @exp_settings, "set($param,$paramval).";
    push @baseexts, $param."_".$paramval;
}

# Add any parameters to the experiments base name.
my $expbase = (@baseexts
    ? $experiment."--".join("-", @baseexts)."--"
    : $experiment);

# If a commands file was specified, add it to the list of commands, otherwise
# just add "induce.". Always show the settings at the start of the run once all the
# settings have been made.
push @exp_commands, "show(settings).";
if ($commands) {
    open(my $commands_fh, '<', $commands)
        || die "Could not open commands file: '$commands'";
    print "Commands File: '$commands'.\n";
    push @exp_commands, "% From commands file: $commands";
    while (my $line = <$commands_fh>) {
        $line =~ s/\r?\n\z//;
        push @exp_commands, $line if $line =~ /\S/;
    }
    close($commands_fh);
}
else {
    push @exp_commands, "induce.";
    push @exp_commands, "write_rules.";
}

# Directory Yap should work in.  PWD is preferred because it is what the
# script has always used; getcwd() covers environments where PWD is not set.
my $pwd = $CYGWIN_ROOT.(defined $ENV{'PWD'} ? $ENV{'PWD'} : getcwd());

# Run all of the cross validations
print basename($expbase).":\n";
my %runs = %{buildruns(\@databases)};
my $runnum = 0;

# Hold-outs are visited in sorted order so that run number N always refers
# to the same held-out fold, no matter how often the experiment is repeated.
foreach my $holdout (sort keys %runs) {
    $runnum++;
    my $runname   = $expbase.$runnum;
    my $outfile   = $runname.".out";
    my $rulesfile = $runname.".rules";
    my @run       = @{$runs{$holdout}};

    # These need to be run to ensure the enviroment is okay and all the files
    # are loaded in.
    my @run_settings = (
        "path(_P).",                # This is here to get around weird path bug
        "add_to_path('$pwd').",     # So we can consult $background
        "consult('$ALEPH').",
        "cd('$pwd').",              # So we can read in the example files
        "read_all('$background', [".join(',', map("'$_'", @run))."]).",
        "set(record, true).",
        "set(recordfile, '$outfile').",
        "set(test_pos, '$holdout.f').",
        "set(test_neg, '$holdout.n').",
        "set(experiment, '$runname').",
        "set(rulefile, '$rulesfile')."
    );

    # Write the commands to an .in file in this order:
    #   run_settings, exp_settings, exp_commands
    my $infilename = $runname.".in";
    open(my $in_fh, '>', $infilename)
        || die "Could not open '$infilename' for writing!\n";
    print $in_fh "#!$YAP -L \n";
    print $in_fh "# BUILT: ".localtime()."\n";

    # Lines starting with '%' are Prolog comments and are copied verbatim;
    # everything else is turned into a directive.
    my @script_lines = map { /^%/ ? $_ : ':- '.$_ }
        (@run_settings, @exp_settings, @exp_commands);
    print $in_fh join("\n", @script_lines)."\n";
    close($in_fh)
        || die "Could not finish writing '$infilename': $!\n";

    # Delete any old .out files that may be lying around so record doesn't append to it.
    if (-f $outfile) {
        unlink($outfile) || warn "Could not remove old '$outfile': $!\n";
    }

    # Make the script executable; like 'chmod +x', honour the umask.
    my @info = stat($infilename);
    if (@info) {
        my $mode = ($info[2] & 07777) | (0111 & ~umask());
        chmod($mode, $infilename)
            || warn "Could not make '$infilename' executable: $!\n";
    }

    # Run the experiment
    my $time = time();
    print "\tRun ".$runnum."...";

    # A script name without a directory part would be looked up in PATH by
    # the shell, so run it explicitly from the current directory.
    my $script = ($PARALLEL || $infilename =~ m{/})
        ? $infilename
        : "./".$infilename;
    my $command = ($PARALLEL ? "enqueue " : "")
        .shellQuote($script).' > /dev/null 2>&1';
    my $status = system($command);

    # All output of the run is discarded, so the exit status is the only
    # sign that something went wrong; report it instead of "completed!".
    my $outcome;
    if    ($status == 0)  { $outcome = $PARALLEL ? "enqueued." : "completed!"; }
    elsif ($status == -1) { $outcome = "FAILED (could not start: $!)!"; }
    elsif ($status & 127) { $outcome = "FAILED (killed by signal ".($status & 127).")!"; }
    else                  { $outcome = "FAILED (exit status ".($status >> 8).")!"; }
    print $outcome;

    $time = time() - $time;
    print " (Time: $time s)\n";
}

print "Experiment completed at: ".localtime()."\n";

# getFiles
# Read in all the files matching the given pattern.
#   $filepattern - a path whose last component is a regular expression;
#                  the directory part is taken literally.
# Returns a reference to a list of "directory/name" strings for every plain
# file in that directory whose name matches the expression from its start.
# Dies if the directory cannot be opened.
sub getFiles {
    my ($filepattern) = @_;

    my $dir  = dirname($filepattern);
    my $file = basename($filepattern);

    opendir(my $dh, $dir) || die "Could not open directory '$dir': $!\n";
    my $pattern = qr/$file/;
    my @files = grep { /^$pattern/ && -f "$dir/$_" } readdir($dh);
    closedir($dh);

    @files = map { $dir."/".$_ } @files;
    return \@files;
}

# buildruns
# Creates a hash of basenames to lists where the lists are a collection of datasets
# to be used in a cross validation run. The basename for each list is for file that
# was held out.
#   $dbref - reference to the list of all dataset base names.
# Returns a reference to a hash mapping each base name to a reference to the
# list of all the other base names.
sub buildruns {
    my ($dbref) = @_;
    my @dbs = @{$dbref};

    my %runs;
    foreach my $holdout (@dbs) {
        # Training files are all those files that are not the holdout file.
        # Compared as plain strings: base names are paths and routinely
        # contain regex metacharacters such as '.'.
        my @currdbs = grep { $_ ne $holdout } @dbs;
        $runs{$holdout} = \@currdbs;
    }

    return \%runs;
}

# shellQuote
# Wraps a string in single quotes for /bin/sh, so that file names containing
# spaces or shell metacharacters are passed through unchanged.
sub shellQuote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'".$text."'";
}