#!/usr/local/bin/perl
#
# Copyright 2005-2009 Genes to Cognition Programme (G2C) and 
# Genome Research Limited (GRL)
#
# Contact: webmaster@genes2cognition.org
# See    : www.genes2cognition.org/software/southern_blot
#
# You may distribute this file/module under the terms of the Perl artistic
# licence: http://www.perlfoundation.org/artistic_license_2_0

=pod

=head1 NAME - run_probe_search

=head1 COMMAND LINE PARAMETERS

Required parameters
  --config_file
  --job_id
  
Optional parameters
  --debug
  --short_run        (default is off) Used for testing purposes. Restricts
                     the maximum number of sequences to be searched against
                     the genome

=head1 CONFIGURATION FILE

See run_probe_search.ini for more configuration details.

=head1 DESCRIPTION

Runner script to execute an exonerate job specified by job_id as part
of the Southern blot probe design process.

Note one does not normally run this script directly, instead instances are
started following the running of submit_probe_search which submits
such jobs under the control of LSF.

=head1 CONTACT

G2C B<email> webmaster@genes2cognition.org

=cut


use strict;
use warnings;
use Bio::SeqIO;
use Carp;
use DirHandle;
use Getopt::Long;
use GeneTargetingDB;
use GeneTargeting::Utils qw (get_temp_dir get_temp_filename
    check_programme_is_in_path print_hash_ref);
use GeneTargeting::Utils::Config;
use GeneTargeting::Utils::Exonerate;

my $debug;

{
    my ( $config_file, $job_id, $short_run );    
    GetOptions(
        "job_id=i"          => \$job_id,
        "config_file=s"     => \$config_file,
        "short_run=i"       => \$short_run,
        "debug"             => \$debug
    );
    
    #Check parameters
    confess "Must set --job_id\n" unless $job_id;
    confess "Must set --config_file\n" unless $config_file;    

    my $cfg = GeneTargeting::Utils::Config->new($config_file, $debug);
    my ($dba) = $cfg->make_analysis_DBAdaptor_from_config();

    #Get the adaptors we need.
    my $job_aptr    = $dba->get_JobAdaptor()
        or confess "Could not get_JobAdaptor";
    my $conf_aptr   = $dba->get_ConfAdaptor()
        or confess "Could not get ConfAdaptor";
    my $seq_aptr    = $dba->get_SequenceAdaptor()
        or confess "Could not get SequenceAdaptor";
    my $ext_db_aptr = $dba->get_ExternalDBAdaptor()
        or confess "Could not get_ExternalDBAdaptor";
    
    #Check exonerate is in the path, set temp files, adaptors
    print "$0\n";
    print "binary is: ", check_programme_is_in_path('exonerate'), "\n";
    set_files();
    set_adaptors($dba);
    
    #Retrieve the Job
    my $job = $job_aptr->fetch_by_db_id($job_id);
    confess "Could not retrieve Job id=$job_id\n" unless $job;
    print "Retrieved Job, id = $job_id\n";
    
    #Check the Job state
    unless ($job->state eq 'submitted') {
       confess "Error: For Job id = $job_id, state = '" . $job->state
            . "' (should be 'submitted')\n";
    }
 
    #Set the job and job adaptor, from now write errors to database
    set_job_and_adaptor($job, $job_aptr);
 
    #Retrieve the relevent Conf
    my $conf = $conf_aptr->fetch_by_db_id($job->analysis_conf_id);
    unless ($conf) {
        confess_and_write_error("Failed to retrieve conf");
    }
    unless ($conf->isa('GeneTargeting::DBEntry::Conf::Exonerate')) {
        confess_and_write_error(
            "Conf must be a 'GeneTargeting::DBEntry::Conf::Exonerate'");
    }
    print "Retrieved Conf, name = ", $conf->name, "\n";

    #Update job state
    $job->state('running');
    $job_aptr->update($job);
    
    my $seqs_searched  = do_exonerate_run($dba, $job, $conf, $short_run);

    #Check there was at least one hit to one sequence
    unless(sequence_hit_count_total() > 0) {
        confess_and_write_error("No sequences produced hits");
    }
    my $seqs_with_hits = print_sequence_hit_counts();
    unless ($seqs_with_hits == $seqs_searched) {
        confess_and_write_error("Only $seqs_with_hits of $seqs_searched gave hits");
    }
    $job->state('success');
    $job_aptr->update($job);

    END {

        cleanup_temp_files();
        $dba->DESTROY if $dba;
    }
}

sub do_exonerate_run {
    my ( $dba, $job, $conf, $short_run ) = @_;

    my $job_aptr  = $dba->get_JobAdaptor();
    my $seq_aptr  = $dba->get_SequenceAdaptor();
    
    my $seq_ids;
    if ($seq_ids = $job_aptr->get_Sequence_ids($job)) {
        print "Fetched ", scalar(@$seq_ids), " Sequence ids";
    } else {
        confess_and_write_error("Got no Sequence ids"); 
    }
    print " (--short run set to: $short_run)" if $short_run;
    print "\n\n";
    
    #Find the files to search against
    my ($fasta_dir, $fasta_files) = get_fasta_dir_files($conf);
    unless ($conf->query_batch_size) {
        confess_and_write_error("query_batch_size not set");
    }
        
    my $seqs_searched = 0;
    my $batch_total = 0;
    my @seqs;
    foreach my $seq_id (@$seq_ids) {

        my $seq = $seq_aptr->fetch_by_db_id($seq_id)
            or confess_and_write_error("Failed to fetch Sequence by id: '$seq_id'");
        if ($batch_total < $conf->query_batch_size) {
            push(@seqs, $seq);
            $batch_total++;
        } else {
            exonerate_Seqs($conf, $fasta_dir, $fasta_files, \@seqs);
            $seqs_searched += $conf->query_batch_size;
            @seqs = ($seq);
            $batch_total = 1
        }
        print STDERR '.' if $debug;
        if ($short_run and $seqs_searched >= $short_run) {
            last;
        }
    }
    if (@seqs) {
        exonerate_Seqs($conf, $fasta_dir, $fasta_files, \@seqs);
        $seqs_searched += scalar(@seqs);
    }
    print STDERR "\n" if $debug;
    print "$seqs_searched query seqs were searched with exonerate\n";
    
    return $seqs_searched;
}

{
    my ($dba, $hit_aptr, $sequence_hit_aptr);
    my (%sequence_comp_hit_counts);

    sub set_adaptors {
        ( $dba ) = @_;
        
        $hit_aptr = $dba->get_HitAdaptor()
            or confess "Could not get HitAdaptor";
        $sequence_hit_aptr = $dba->get_SequenceHitAdaptor()
            or confess "Could not get SequenceHitAdaptor";
    }

    sub store_exonerate_result {
        my ( $conf, $result ) = @_;

        my $accession = $result->{target_id};
        my $ext_db_id = $conf->ext_db_id;

        my $hit;
        unless ($hit = $hit_aptr->fetch_by_accession_and_ext_db_id(
            $accession, $ext_db_id)) {

            $hit = GeneTargeting::DBEntry::Hit->new;
            $hit->accession($result->{target_id});
            $hit->ext_db_id($conf->ext_db_id);
            $hit->seq_length($result->{target_length});
            $hit->description('genomic DNA');
            $hit_aptr->store($hit);
        }

        my $component_hit = GeneTargeting::DBEntry::ComponentHit->new;
        $component_hit->query_id($result->{query_id});
        $component_hit->hit_id($hit->id);
        $component_hit->conf_id($conf->id);
        $component_hit->score($result->{score});
        $component_hit->query_start($result->{query_start});
        $component_hit->query_end($result->{query_end});
        $component_hit->query_strand($result->{query_strand});
        $component_hit->subject_start($result->{target_start});
        $component_hit->subject_end($result->{target_end});
        $component_hit->subject_strand($result->{target_strand});
        $component_hit->percent_id($result->{percent_id});
        $component_hit->soft_masked($result->{soft_masked});
        $sequence_hit_aptr->store($component_hit);
        
        $sequence_comp_hit_counts{$result->{query_id}}++;
        return 1;
    }
    
    sub print_sequence_hit_counts {

        print "Total sequences with hits: "
            , scalar(keys(%sequence_comp_hit_counts)), "\n\n";
        print "Sequence id   SequenceHit Count\n";
        print "-----------   -----------------\n\n";
        foreach my $sequence_id (sort {$a <=> $b} (keys(%sequence_comp_hit_counts))) {
            print sprintf("%8s", $sequence_id), ' ' x 13
                , $sequence_comp_hit_counts{$sequence_id}, "\n";
        }
        print "\n";
        
        return scalar(keys(%sequence_comp_hit_counts));
    }
    
    sub sequence_hit_count_total {

        my $total = 0;
        foreach my $sequence_id (keys(%sequence_comp_hit_counts)) {
        
            $total += $sequence_comp_hit_counts{$sequence_id};
        }
        return $total;
    }
}

{   
    my ($input_file, $output_file);
    
    #Can pass a reference to an array of GeneTargeting::DBEntry::Sequence
    #objects or a single object of that class
    
    sub exonerate_Seqs {
        my ( $conf, $target_dir, $target_files, $seq_s ) = @_;
        
        {
            my $seqIO = Bio::SeqIO->new(-file   => ">$input_file",
                                        -format => 'FASTA')
                or confess_and_write_error("Could not create file '$input_file' : $!");

            my $seqs;
            if (ref($seq_s) =~ /ARRAY/) {
                $seqs = $seq_s;
            } else {
                $seqs = [$seq_s];
            }

            foreach my $seq (@$seq_s) {
                my $bio_seq = Bio::Seq->new;
                $bio_seq->id($seq->id);
                $bio_seq->seq($seq->dna);
                $seqIO->write_seq($bio_seq);
            }
        }
        
        foreach my $target_file (@$target_files) {
            
            my $search_file = $target_dir . '/' . $target_file;
            my $runner_string = "exonerate --ryo '%S pArSe %pi %tl"
                . q/\n' --showvulgar false /;
            if ($conf->args) {
                my $args = $conf->args;
                $args =~ s/^\s+//;
                $args =~ s/\s$//;

                $runner_string .= "$args ";
            }
            $runner_string .= q/--query / . $input_file
                . " --target $search_file > $output_file";
            system $runner_string;

            my $fh = parse_exonerate_file($output_file)
                or confess_and_write_error("Could not open exonerate file '$output_file': $!");

            while (my $result = get_next_exonerate_result($fh)) {

                store_exonerate_result($conf, $result);
                print_hash_ref($result) if $debug;
            }
        }        
    }
    
    sub set_files {
        
        my $tmp_dir = get_temp_dir();
        
        $input_file  = $tmp_dir . '/' . get_temp_filename();
        $output_file = $tmp_dir . '/' . get_temp_filename();       
    }

    sub cleanup_temp_files {

        if ($input_file and -f $input_file) {
            unlink $input_file;
            print STDERR "Deleted: $input_file\n" if $debug;
        }        

        if ($output_file and -f $output_file) {
            unlink $output_file;
            print STDERR "Deleted: $output_file\n" if $debug;
        }        
    }
}

sub get_fasta_dir_files {
    my ( $conf ) = @_;

    my $fasta_dir;
    unless ($fasta_dir = $conf->fasta_dir) {
        confess_and_write_error("fasta_dir not set in Conf");
    }
    unless (-d $fasta_dir) {
        confess_and_write_error("Can't read fasta_dir");
    }
    
    #Lets try to get the files
    print "Checking for *.fa files in: $fasta_dir\n";
    my $dh = DirHandle->new($fasta_dir)
        or confess_and_write_error("Can't read $fasta_dir : $!");
    my @files = sort(grep { /\.fa$/ } $dh->read());
    unless (@files) {
        confess_and_write_error("Failed to find any files ending *.fa");
    }
    
    print "Found ", scalar(@files), " files:\n  ";
    foreach my $file (@files) {
        print "$file";
        print ", " unless $file eq $files[$#files];
    }  
    print "\n\n";

    return ($fasta_dir, \@files);    
}

{
    my ($job_aptr, $job);
    sub set_job_and_adaptor {
        ( $job, $job_aptr ) = @_;
    
    }
    
    sub confess_and_write_error {
        my ( $error ) = @_;

        $job_aptr->store_Job_Error($job, $error);
        $job->state('failed');
        $job_aptr->update($job);
        confess $error;
    }
}

sub configure_parameters {

    my $mandatory = {};
    my $optional  = {};

    $mandatory->{'analysis_database'} = ['db', 'host', 'user'];
     $optional->{'analysis_database'} = ['pass', 'port'];
    
    return ($mandatory, $optional);
}
