#!/usr/local/bin/perl
#
# Copyright 2005-2009 Genes to Cognition Programme (G2C) and 
# Genome Research Limited (GRL)
#
# Contact: webmaster@genes2cognition.org
# See    : www.genes2cognition.org/software/southern_blot
#
# You may distribute this file/module under the terms of the Perl artistic
# licence: http://www.perlfoundation.org/artistic_license_2_0

=pod

=head1 NAME - create_probe_search

=head1 COMMAND LINE PARAMETERS

Required parameters
  --config_file         
  --probe_name          A unique name to be used for the probe
  --probe_description   
  --chromosome          } 
  --start               } Specifies the probe design window
  --end                 } on the target genomic assembly

Optional parameters
  --strand              +1|-1 (default is +1 i.e. forward with
                        respect to the genome assembly)
  --end_bias            If specified, must be '5prime' or '3prime'
  --debug               Switch on debugging
  --help|h              Output documentation

=head1 CONFIGURATION FILE

See create_probe_search_*.ini for more configuration details.

=head1 DESCRIPTION

Programme to design probe sequences for Southern blotting, by finding
(near) unique sequences in a specified region of a genome, containing
little repetitive DNA. 

The probe design process is a three step one:

1) create_probe_search
2) submit_probe_search       - which causes multiple copies
                             - of run_probe_seach
                             - to be executed on machine(s)   
3) analyse_probe_search

And optionally:

4) get_probe_search_cpu_time
5) delete_probe_search
6) delete_job_results

Using create_probe_search, the design window is specified with the
--chromosome, --start and --end parameters in which to search for good
probes using a brute force approach of performing many searches scanning
within the region, and the size range specified for acceptable probes.

The --end_bias parameter allows one to specify at which end of the window
one would prefer the probe to be located, affecting the probes selected by
running analyse_probe_search

Technically the programme creates a number of GeneTargeting::DBEntry::Jobs
for performing exonerate searches, and these can be submitted to execution
hosts using submit_probe_search

=head1 EXAMPLE RUN

Example command line:

./create_probe_search --config_file=create_probe_search_NCBI_m37.ini
--probe_name=test --probe_description=testing --chromosome=2 
--start=25209733 --end=25212733 --end_bias=5prime

Ensembl database --
Connected to host: ensembldb.ensembl.org, as user: anonymous
Database         : mus_musculus_core_47_37

GeneTargeting analysis database --
Connected to host: host, as user: user
Database       : southern_blot_design

Stored genome ExternalDB     : mouse_NCBIM37 (with id: 1)
Stored exonerate search Conf : Exonerate mouse_NCBIM37 (with id: 1)

Coordinates checkout OK with chromosome length

Designing probe in this region:
REF              : Bio::EnsEMBL::Slice
name             : chromosome:NCBIM37:2:25209733:25212733:1
seq_region_name  : 2
seq_region_length: 181748087
strand           : 1
start            : 25209733
end              : 25212733
length           : 3001

Stored as DNAProbe 'test' (with id: 1)
Design window length: 3001
Designing sequences with 5' bias in window

Probe size: 1300, step size: 130
Probe size: 1250, step size: 125
Probe size: 1200, step size: 120
Probe size: 1150, step size: 115
Probe size: 1100, step size: 110
Probe size: 1050, step size: 105
Probe size: 1000, step size: 100
Probe size: 950, step size: 95
Probe size: 900, step size: 90
Probe size: 850, step size: 85
Probe size: 800, step size: 80
Probe size: 750, step size: 75
Probe size: 700, step size: 70
Probe size: 650, step size: 65
Probe size: 600, step size: 60
Probe size: 550, step size: 55
Probe size: 500, step size: 50

----------------------------------------

Total jobs created: 23
Total sequences stored for searching: 458

=head1 CONTACT

G2C B<email> webmaster@genes2cognition.org

=cut


use strict;
use warnings;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Carp;
use GeneTargetingDB;
use GeneTargeting::Utils qw( show_perldoc print_hash_ref interrogate_slice );
use GeneTargeting::Utils::Config;
use GeneTargeting::Utils::Exonerate;
use Getopt::Long;

#global
my $debug;

{
    my ($config_file, $probe_name, $probe_description, $end_bias, $chromosome
        , $start, $end, $strand);

    GetOptions(
        "config_file=s"       => \$config_file,
        "probe_name=s"        => \$probe_name,
        "probe_description=s" => \$probe_description,
        "end_bias=s"          => \$end_bias,
        "chromosome=s"        => \$chromosome,
        "start=i"             => \$start,
        "end=i"               => \$end,
        "strand=s"            => \$strand,
        "debug"               => \$debug,
        "help|h"              => \&show_perldoc
    ) or show_perldoc();

    show_perldoc("Must set --probe_name") unless $probe_name;
    show_perldoc("Must set --probe_description") unless $probe_description;
    if ($end_bias) {
        unless ($end_bias =~ /^5prime$|^3prime$/) {
            show_perldoc("If specified, --end_bias must be '5prime' or '3prime'")
        }
    }
    if ($strand) {
        unless ($strand == 1 or $strand == -1) {
            show_perldoc("--stand must be set to +1 or -1");
        }
    } else {
        $strand = 1;
    }
    show_perldoc("Must set --chromosome") unless $chromosome;
    show_perldoc("Must set --start") unless $start;
    show_perldoc("Must set --end") unless $end;
    
    #Make the database connections
    my $cfg = GeneTargeting::Utils::Config->new($config_file, $debug);
    print "Ensembl database --\n";
    my ($ensembl_dba) = $cfg->make_DBAdaptor_from_config(1);
    print "GeneTargeting analysis database --\n";
    my ($GeneTargeting_dba) = $cfg->make_analysis_DBAdaptor_from_config(1);

    my ($dna_probe_aptr, $ext_db_aptr, $conf_aptr, $seq_aptr, $job_aptr)
        = get_GeneTargeting_adaptors($GeneTargeting_dba);

    # Check whether ExternalDB and search Conf entries exist in the GeneTargeting
    # database, if not, create them from the information specifed in the 
    # [exonerate_genome] section of the config file
    my ($ext_db, $conf) = get_ExternalDB_and_Conf($GeneTargeting_dba, $cfg);

    #Validate the genomic coordinates, and get slice we need
    my $slice = get_Slice_of_design_window($ensembl_dba, $cfg
        , $chromosome, $start, $end, $strand);
    
    
    
    my $dna_probe = create_DNAProbe($GeneTargeting_dba, $cfg, $slice
        , $probe_name, $probe_description, $end_bias);
    
    create_DNAProbe_search($GeneTargeting_dba, $cfg, $conf
        , $slice, $dna_probe);
}


sub create_DNAProbe {
    my ($GeneTargeting_dba, $cfg, $slice, $probe_name, $probe_description
        , $end_bias) = @_;

    my ($dna_probe_aptr, $ext_db_aptr, $conf_aptr, $seq_aptr, $job_aptr)
        = get_GeneTargeting_adaptors($GeneTargeting_dba);
    if ($dna_probe_aptr->fetch_by_name($probe_name)) {
        confess "DNAProbe called '$probe_name' already exists";
    }

    my $dna_probe = GeneTargeting::DBEntry::DNAProbe->new;
    if (my $assembly = $slice->coord_system->version) {
        $dna_probe->assembly($assembly);
    } else {
        confess "Could not get assembly (coord_system->version)"
            . "from Slice object";
    }
    if ($slice->coord_system_name eq 'chromosome'
        and $slice->seq_region_name) {
        $dna_probe->chromosome($slice->seq_region_name);
    } else {
        confess "Cannot set chromosome from Slice object";
    }

    if ($slice->start) {
        $dna_probe->start($slice->start)
    } else {
        confess "Cannot set start from Slice object";
    }
    if ($slice->end) {
        $dna_probe->end($slice->end);
    } else {
        confess "Cannot set end from Slice object";
    }
    if ($slice->strand) {
        $dna_probe->strand($slice->strand);
    } else {
        confess "Cannot set strand from Slice object";
    }
    $dna_probe->strand($slice->strand);
    $dna_probe->end_bias($end_bias);
    $dna_probe->min_length($cfg->genomic_probe_min);
    $dna_probe->max_length($cfg->genomic_probe_max);
    $dna_probe->name($probe_name);
    $dna_probe->description($probe_description);
    $dna_probe_aptr->store($dna_probe);
    
    print "Stored as DNAProbe '$probe_name' (with id: ", $dna_probe->id, ")\n";
    return $dna_probe;
}

sub create_DNAProbe_search {
    my ( $GeneTargeting_dba, $cfg, $conf, $slice, $dna_probe ) = @_;

    my ($dna_probe_aptr, $ext_db_aptr, $conf_aptr, $seq_aptr, $job_aptr)
        = get_GeneTargeting_adaptors($GeneTargeting_dba);

    my $probe_size = $cfg->genomic_probe_max;
    print "Design window length: ", $slice->length, "\n";

    my $job;
    my $job_count = 0;
    my $seqs_stored_for_job = 0;
    my $total_seqs_stored = 0;
    
    #Set the design direction to a default if it was not specified
    my $end_bias;
    $end_bias = $dna_probe->end_bias or $end_bias = '5prime';

    if ($end_bias eq '5prime') {
    
        print "Designing sequences with 5' bias in window\n" if $dna_probe->end_bias;
        while ($probe_size >= $cfg->genomic_probe_min) {

            my $design_base = 1;
            my $bp_step_size  =
                int($probe_size * $cfg->genomic_probe_frac_window_slide);
            print "\nProbe size: $probe_size, step size: $bp_step_size\n\n";

            while ($design_base + $probe_size <= $slice->length) {

                unless ($job) {
                    $job = create_and_store_job($job_aptr, $conf);
                    $job_count++;
                }
                my $probe_slice = $slice->sub_Slice($design_base
                        , $design_base + $probe_size - 1);
                
                my $seq = GeneTargeting::DBEntry::Sequence->new_from_Slice($probe_slice);
                $seq_aptr->store($seq);
                $job_aptr->store_Job_Sequence($job, $seq);
                $dna_probe_aptr->store_DNAProbe_Sequence($dna_probe, $seq);
                $seqs_stored_for_job++;
                $total_seqs_stored++;
                if ($seqs_stored_for_job >= $cfg->exonerate_genome_job_size) {
                    $job = undef;
                    $seqs_stored_for_job = 0;
                }
                $design_base += $bp_step_size;
                print "  Made sequence from: ", $probe_slice->name
                    , " (length: ", $probe_slice->length, ")\n" if $debug;
            }
            $probe_size -=  $cfg->genomic_probe_size_decrement;
        }
    } else { #3prime end_bias

        print "Designing sequences with 3' bias in window\n" if $dna_probe->end_bias;
        while ($probe_size >= $cfg->genomic_probe_min) {

            my $design_base = $slice->length;
            my $bp_step_size  =
                int($probe_size * $cfg->genomic_probe_frac_window_slide);
            print "\nProbe size: $probe_size, step size: $bp_step_size\n\n";

            while ($design_base - $probe_size + 1 >= 1) {

                unless ($job) {
                    $job = create_and_store_job($job_aptr, $conf);
                    $job_count++;
                }
                my $probe_slice = $slice->sub_Slice($design_base - $probe_size + 1
                        , $design_base);
                
                my $seq = GeneTargeting::DBEntry::Sequence->new_from_Slice($probe_slice);
                $seq_aptr->store($seq);
                $job_aptr->store_Job_Sequence($job, $seq);
                $dna_probe_aptr->store_DNAProbe_Sequence($dna_probe, $seq);
                $seqs_stored_for_job++;
                $total_seqs_stored++;
                if ($seqs_stored_for_job >= $cfg->exonerate_genome_job_size) {
                    $job = undef;
                    $seqs_stored_for_job = 0;
                }
                $design_base -= $bp_step_size;
                print "  Made sequence from: ", $probe_slice->name
                    , " (length: ", $probe_slice->length, ")\n" if $debug;
            }
            $probe_size -=  $cfg->genomic_probe_size_decrement;
        }
    }
    print '  ', '-' x 40, "\n\n";
    print "Total jobs created: $job_count\n";
    print "Total sequences stored for searching: $total_seqs_stored\n";
}

sub get_Slice_of_design_window {
    my ( $ensembl_dba, $cfg, $chromosome, $start, $end, $strand ) = @_;
    
    confess "--start must be >0" unless $start > 0;
    confess "--end must be > 0" unless $end > 0;
    confess "--end must be greater than --start" unless $end > $start;
    
    my $window_length = $end - $start + 1;
    
    unless ($window_length > $cfg->genomic_probe_min) {
        confess "--start and -end must specify a region of the genome"
            . " greater than [genomic_probe] min in the .ini file";
    }
    
    unless ($window_length > $cfg->genomic_probe_max) {
        confess "--start and --end must specify a region of the genome"
            . " greater than [genomic_probe] max in the .ini file"
    }
    
    my $slice_aptr = $ensembl_dba->get_SliceAdaptor()
        or confess "Could not get SliceAdaptor";
        
    my $slice = $slice_aptr->fetch_by_region('chromosome', $chromosome, $start
        , $end, $strand) or confess "Failed to get slice with fetch_by_region";
    
    #Check coordinates are on chromosome
    my $chr_length = $slice->seq_region_length();
    if ($start > $chr_length and $end > $chr_length) {
        confess "Both specified --start and --end are greater than chromosome length ($chr_length bp)";
    }
    if ($start > $chr_length) {
        confess "Specified --start is greater than chromosome length ($chr_length bp)";
    }
    if ($end > $chr_length) {
        confess "Specified --end is greater than chromosome length ($chr_length bp)";
    }
    print "Coordinates checkout OK with chromosome length\n\n";
    print "Designing probe in this region: \n";
    interrogate_slice($slice);
    return $slice;
}

sub get_ExternalDB_and_Conf {
    my ( $GeneTargeting_dba, $cfg ) = @_;
    
    my ($dna_probe_aptr, $ext_db_aptr, $conf_aptr, $seq_aptr, $job_aptr)
        = get_GeneTargeting_adaptors($GeneTargeting_dba);

    my $ext_db;
    unless ($ext_db = $ext_db_aptr->fetch_by_name($cfg->exonerate_genome_db_name)) {
        $ext_db = GeneTargeting::DBEntry::ExternalDB->new;
        $ext_db->db_name($cfg->exonerate_genome_db_name);
        $ext_db->display_label($cfg->exonerate_genome_display_label);
        $ext_db->sequence_source($cfg->exonerate_genome_sequence_source);
        $ext_db->description($cfg->exonerate_genome_description);
        $ext_db_aptr->store($ext_db);
        print 'Stored genome ExternalDB     : ', $ext_db->db_name
            , " (with id: ", $ext_db->id, ")\n";
    } else {
        print 'Fetched genome ExternalDB    : '
            , $ext_db->db_name, "\n";
    } 

    my $conf;
    unless ($conf = $conf_aptr->fetch_by_name($cfg->exonerate_genome_search_name)) {
        $conf = GeneTargeting::DBEntry::Conf::Exonerate->new;
        $conf->name($cfg->exonerate_genome_search_name);
        $conf->description('Search against ' . $cfg->exonerate_genome_description);
        $conf->fasta_dir($cfg->exonerate_genome_dir);
        $conf->ext_db_id($ext_db->id);
        $conf->args($cfg->exonerate_genome_args);
        $conf->query_batch_size($cfg->exonerate_genome_query_batch_size);
        $conf_aptr->store($conf);
        print 'Stored exonerate search Conf : ', $conf->name
            , " (with id: ", $conf->id, ")\n";
    } else {
        print 'Fetched exonerate search Conf: '
            , $cfg->exonerate_genome_search_name, "\n";
    }
    print "\n";
    return ($ext_db, $conf);
}

sub get_GeneTargeting_adaptors {
    my ( $dba ) = @_;

    my $dna_probe_aptr = $dba->get_DNAProbeAdaptor()
        or confess "Could not get DNAProbeAdaptor";
    my $ext_db_aptr = $dba->get_ExternalDBAdaptor()
        or confess "Could not get ExternalDBAdaptor";
    my $conf_aptr   = $dba->get_ConfAdaptor()
        or confess "Could not get ConfAdaptor";
    my $seq_aptr  = $dba->get_SequenceAdaptor()
        or confess "Could not get SequenceAdaptor";
    my $job_aptr  = $dba->get_JobAdaptor()
        or confess "Could not get JobAdaptor";
        
    return ($dna_probe_aptr, $ext_db_aptr, $conf_aptr, $seq_aptr
        , $job_aptr);
}

sub create_and_store_job {
    my ( $job_aptr, $conf ) = @_;
    
    my $job = GeneTargeting::DBEntry::Job->new;
    $job->analysis_conf_id($conf->id);
    $job->state('created');
   
    $job_aptr->store($job);
    print "Stored job with id: ". $job->id, "\n" if $debug;
    return $job;
}

###  configure_parameters
#
# Specifies the mandatory and optional parameters that can (or must)
# be specified in the configuration .ini file for the programme
# (see create_probe_search_defaults.ini)

sub configure_parameters {

    my $mandatory = {};
    my $optional  = {};

    $mandatory->{'ensembl_database'}  = ['db', 'host', 'user'];
     $optional->{'ensembl_database'}  = ['pass', 'port'];

    $mandatory->{'analysis_database'} = ['db', 'host', 'user'];
     $optional->{'analysis_database'} = ['pass', 'port'];
    

    $mandatory->{'exonerate_genome'}  = ['db_name', 'search_name'
        , 'display_label', 'sequence_source', 'description', 'dir'
        , 'job_size', 'args', 'query_batch_size'];
        
    $mandatory->{'genomic_probe'}     = ['min', 'max'
        , 'frac_window_slide', 'size_decrement'];
    return ($mandatory, $optional);
}

