#!/usr/local/bin/perl
#
# Copyright 2005-2009 Genes to Cognition Programme (G2C) and 
# Genome Research Limited (GRL)
#
# Contact: webmaster@genes2cognition.org
# See    : www.genes2cognition.org/software/southern_blot
#
# You may distribute this file/module under the terms of the Perl artistic
# licence: http://www.perlfoundation.org/artistic_license_2_0

=pod

=head1 NAME - analyse_probe_search

=head1 COMMAND LINE PARAMETERS

Required parameters
  --config_file                 
  --probe_name                } these are mutually
  --all_probes                } exclusive
   

Optional parameters
  --output_file               specify an output file name other
                              than the default
  --clobber                   allows overwriting of output html/graphic
                              file if it already exists
  --debug
  --help|h              

=head1 CONFIGURATION FILE

See analyse_probe_search.ini for more configuration details.

=head1 DESCRIPTION

Final step in the automated Southern blot design process. Checks all jobs
for the probe design are complete, then analyses all the search results
reporting those putative probes that exceed the selection criteria specified.

Designs primers with Primer3 for all the successfull probes. Generates a
graphic html report of the analysis.

=head1 EXAMPLE RUN

./analyse_probe_search --config_file=analyse_probe.ini
--probe_name=test

GeneTargeting analysis database --
Connected to host: host, as user: user
Database       : southern_blot_design

Analysing probe: test
  assembly      : NCBIM37
  bias          : 5prime
  chromosome    : 2
  description   : testing
  end           : 25212733
  id            : 1
  name          : test
  start         : 25209733
  strand        : 1
  -------------
Generating image
  - done
Generating primers
Primer3 parameters used
------------------

  PRIMER_GC_CLAMP     : 1
  PRIMER_MAX_GC       : 50
  PRIMER_MAX_SIZE     : 22
  PRIMER_MAX_TM       : 60
  PRIMER_MIN_GC       : 40
  PRIMER_MIN_SIZE     : 20
  PRIMER_MIN_TM       : 55
  PRIMER_NUM_RETURN   : 5
  PRIMER_OPT_SIZE     : 21
  PRIMER_SALT_CONC    : 50
  -------------------
Wrote: /southern_blot_design/output/test_probe_search_primers.html

  - done
Wrote: /southern_blot_design/output/test_probe_search.html


=head1 CONTACT

G2C B<email> webmaster@genes2cognition.org

=cut


use strict;
use warnings;
use Bio::Graphics;
use Bio::SeqFeature::Generic;
use Carp;
use Getopt::Long;
use GeneTargetingDB;
use GeneTargeting::Utils qw( show_perldoc print_hash_ref get_temp_dir
    get_temp_filename output_GD get_max_and_min_from_arrayref
    format_string_sixty_chrs_per_line);
use GeneTargeting::Utils::Config;
use GeneTargeting::Utils::GD;
use GeneTargeting::Utils::HTMLReport;
use GeneTargeting::Utils::Primer3;

#Globals
my ($debug , $html, $html_primer, $tmp_file, $clobber);

{
    my ( $probe_name, $all_probes, $config_file, $debug );    
    GetOptions(
        "probe_name=s"      => \$probe_name,
        "all_probes"        => \$all_probes,
        "config_file=s"     => \$config_file,
        "debug"             => \$debug,
        "clobber"           => \$clobber,
        "help|h"            => \&show_perldoc,
    ) or show_perldoc();
    
    if ($probe_name and $all_probes) {
        show_perldoc("Must set one of --probe_name or --all_probes");
    }
    unless ($probe_name or $all_probes) {
        show_perldoc("Must set one of --probe_name or --all_probes");
    }
    
    print "$0 : ", scalar(localtime), "\n\n";
    print "GeneTargeting analysis database --\n";
    my $cfg = GeneTargeting::Utils::Config->new($config_file, $debug);
    my ($GeneTargeting_dba, $db_txt) = $cfg->make_analysis_DBAdaptor_from_config(1);

    #tmp filename for Primer3
    $tmp_file = get_temp_dir() . '/' . get_temp_filename();
    
    #Get the adaptors we need.
    my ($probe_aptr, $seq_aptr, $hit_aptr, $conf_aptr)
        = get_adaptors($GeneTargeting_dba);
    
    #Get the probes
    my $probes = get_DNAProbes($GeneTargeting_dba, $all_probes, $probe_name);
    foreach my $probe (@$probes) {

        $probe_name = $probe->name;
        print "Analysing probe: $probe_name\n";
        
        my $output_file = $probe_name . '_probe_search.html';
        
        $html = GeneTargeting::Utils::HTMLReport->new;
        $html->write_mode('clobber') if $clobber;
        $html->title("$probe_name - Southern blot probe search results");
        $html->append_line("<pre><b>$0 : " . scalar(localtime) . "</b>\n");
        $html->append($db_txt);
        
        $html->append_line("Fetched probe design: $probe_name (id: " . $probe->id . ")\n");
        $html->append(print_hash_ref($probe, 'truncate'));
        $html->append_line("\n");

        $html->append_line("Design window length: ");
        my $probe_window_length = $probe->end - $probe->start + 1;
        $html->append_line($probe_window_length . "bp\n\n");
    
        #Make sure all Jobs are successful before proceeding, and get the Conf
        if (my $conf = check_state_of_Jobs_for_DNAProbe($GeneTargeting_dba, $probe)) {

            analyse_putative_probe_sequences($GeneTargeting_dba, $cfg, $probe, $conf, $db_txt);
            $html->append_line('</pre>');
            $html->write($cfg->output_dir_loc . '/' . $output_file);
            print "Wrote: ", $cfg->output_dir_loc . '/' . $output_file, "\n\n";        
        } else {
            print "Skipping\n";
        }
    }
   
    END {
        unlink $tmp_file if !$debug and $tmp_file and -e $tmp_file;
    }    
}

sub get_DNAProbes {
    my ( $dba, $all_probes, $probe_name ) = @_;

    my ($probe_aptr, $seq_aptr, $hit_aptr, $conf_aptr)
        = get_adaptors($dba);
    
    my $probes;
    if ($all_probes) {
        $probes = $probe_aptr->fetch_all();
        unless ($probes) {
            confess "Fetched no DNAProbes for analysis";
        }
    } else {
        my $probe = $probe_aptr->fetch_by_name($probe_name);
        confess "Could not fetch probe by name '$probe_name'"
            unless $probe;
        $probes = [];
        push(@$probes, $probe);
    }
    return $probes;    
}

sub make_probe_image_2 {
    my ( $cfg, $probe, $unique_seqs, $passed_seqs, $failed_seqs
        , $img_width, $img_height ) = @_;
    
    $img_height = 250 unless $img_height;
    $img_width  = 800 unless $img_width;
    
    my $file_name;
    {
        my $gd = makeGD($img_width, $img_height, 'white', 1);
        $gd->filledRectangle(0,0,$img_width - 1,$img_height - 1, GD_colour('white'));
        
        set_x_scale_factor(int(($probe->end - $probe->start + 1) / $gd->width) + 1);

        my $failed_base_counts =
            calculate_frequency_of_sequences_in_probe_window($probe, $failed_seqs);
        my $passed_base_counts =
            calculate_frequency_of_sequences_in_probe_window($probe, $passed_seqs);
        my $unique_base_counts = 
            calculate_frequency_of_sequences_in_probe_window($probe, $unique_seqs);
        set_y_scale_factor($gd, $failed_base_counts, $passed_base_counts);
        base_plot($gd, $probe, $failed_base_counts, 'red');
        base_plot($gd, $probe, $passed_base_counts, 'green');
        base_plot($gd, $probe, $unique_base_counts, 'yellow');

        add_scale($gd, $probe);
        $file_name = output_GD($gd, $cfg->output_dir_loc . '/' . $probe->name . '_diag'
            , 'png', $clobber);
    }
    
    if (rindex($file_name, '/')) {
        $file_name = substr($file_name, rindex($file_name, '/') + 1);
    }
    return $file_name;
}


{
    my ($x_scale_factor, $y_scale_factor);
    sub set_x_scale_factor {
        my ( $value ) = @_;
        $x_scale_factor = $value;
    }
    
    sub set_y_scale_factor {
        my ( $gd, $failed_base_counts, $passed_base_counts ) = @_;
        
        my ($failed_max_y) = get_max_and_min_from_arrayref($failed_base_counts);
        my ($passed_max_y) = get_max_and_min_from_arrayref($passed_base_counts);
        
        my $max_y = $failed_max_y;
        if ($passed_max_y > $failed_max_y) {
            $max_y = $passed_max_y;
        }
        $y_scale_factor = $max_y / ($gd->height - 50);
    }

    sub add_scale {
        my ( $gd, $probe ) = @_;

        my $probe_window_length = $probe->end - $probe->start + 1;

        my $tick_interval = 100;
        while ($probe_window_length / $tick_interval > 22) {
            $tick_interval += 100;
        }

        my $dna_pos = 0;
        my $x_start = 0;

        my $y1_big_tick = $gd->height - 45;
        my $y2_big_tick = $gd->height - 35;
        my $y1_small_tick = $gd->height - 45;
        my $y2_small_tick = $gd->height - 40;
        my $y_font = $gd->height - 40;
        my $font = GD::Font->Small;
        my $black = GD_colour('black');

        $gd->line(0, $gd->height - 45, $gd->width - 1, $gd->height - 45, $black);
        while ($dna_pos < $probe_window_length) {
            if ($dna_pos % ($tick_interval * 10)) {
                $gd->line($x_start + $dna_pos / $x_scale_factor, $y1_small_tick,
                    $x_start + $dna_pos / $x_scale_factor, $y2_small_tick, $black);
            } else {
                $gd->line($x_start + $dna_pos / $x_scale_factor, $y1_big_tick,
                    $x_start + $dna_pos / $x_scale_factor, $y2_big_tick, $black);
            }

            $gd->string($font, $x_start + $dna_pos / $x_scale_factor 
                - (length($dna_pos) - 1) * 6, $y_font + 6, $dna_pos, $black);
            $dna_pos += $tick_interval;
        }
    }

    sub calculate_frequency_of_sequences_in_probe_window {
        my ( $probe, $sequences ) = @_;
        
        my $probe_window_length = $probe->end - $probe->start + 1;
        my $offset = $probe->start;

        # Initialise the array so count starts from zero at every base
        # position in the probe window
        my $base_positions = [];
        for (my $i = 0; $i < $probe_window_length; $i++) {
            $base_positions->[$i] = 0;
        } 

        # Build up the counts by iterating over the sequences, and every
        # base in each sequence
        foreach my $seq (@$sequences) {
            for (my $i = $seq->start; $i <= $seq->end; $i++) {
                $base_positions->[$i - $offset]++;
            }
        }
        return ($base_positions);
    }

    sub base_plot {
        my ($gd, $probe, $base_counts, $colour) = @_;

        # Draw the lines by averaging across the base positions to be
        # represented a single pixel
        my $base_position = 0;
        foreach (my $x = 0; $x < $gd->width; $x++) {

            my $total_count = 0;
            for (my $i = 0; $i < $x_scale_factor; $i++) {
                $total_count += $base_counts->[$base_position + $i]
                    if $base_position + $i < $#$base_counts;
                $base_position++;
            }

            my $average_count = $total_count / $x_scale_factor;
            if ($average_count) {

                #This is nasty and needs to be fixed
                if ($y_scale_factor == 0) {
                    print "**Caution y_scale_factor was 0\n";
                    $y_scale_factor = 1;
                }
            
                $gd->line($x, $gd->height - 51, $x
                    , ($gd->height - 51) - $average_count / $y_scale_factor, GD_colour($colour));
            }
        }
    }
}

sub analyse_putative_probe_sequences {
    my ( $dba, $cfg, $probe, $conf, $db_txt ) = @_;

    my ($probe_aptr, $seq_aptr, $hit_aptr) = get_adaptors($dba);
    my $seq_ids = $probe_aptr->get_Sequence_ids($probe)
        or confess "Could not get_Sequence_ids for probe '", $probe->name, "'";
    $html->append_line("Fetched " . scalar(@$seq_ids)
        . " putative probes to analyse\n\n");
    output_threshold_criteria($cfg, $probe);

    my $score_ratio_cutoff = $cfg->criteria_score_ratio;
    my $percent_repetitive_cutoff = $cfg->criteria_percent_repetitive;

    my $seqs_without_hits = 0;
    my $sequences_with_single_self_hit = 0;
    my $sequences_with_no_self_hit = 0;
    my $sequences_unique = 0;
    
    my @sequences;
    foreach my $seq_id (@$seq_ids) {

        my $found_self_hit = 0;
        my $sequence = $seq_aptr->fetch_by_db_id($seq_id)
            or confess "Could not fetch Sequence by id '$seq_id'";
        push (@sequences, $sequence);
        
        my $hits = $hit_aptr->fetch_by_Sequence_id_Conf_id($seq_id, $conf->id);
        if ($hits) {
            if (add_self_hit($sequence, $hits)) {
                $sequences_with_single_self_hit++;
                unless (add_score_ratio($sequence, $hits)) {
                    $sequences_unique++;
                }
                add_distance_from_ideal($probe, $sequence) if $probe->end_bias;
                add_percent_repetitive($sequence);
                add_overall_score($probe, $sequence);
            } else {
                $sequences_with_no_self_hit++;
            }
        } else {
            $seqs_without_hits++;
        }
    }
    
    $html->append_line("Putative probes without hits                    : "
        . "$seqs_without_hits\n");
    $html->append_line("Putative probes with single_self_hit            : "
        . $sequences_with_single_self_hit . '/' . scalar(@sequences) . "\n");
    $html->append_line("Putative probes that are unique                 : "
        . $sequences_unique . '/' . scalar(@sequences) . "\n");
    
    # Check if all sequences had a single self-hit, if not, write the html (so far)
    # and terminate
    unless ($sequences_with_single_self_hit == scalar(@sequences)) {
        print STDERR "Analyis failed as not all putative probes have a single self-hit";
        return;
    }
    
    
    my ($passed_sequences, $failed_sequences, $unique_sequences) =
        filter_by_unique_or_score_and_repetitive_criteria($cfg, \@sequences);
    unless ($unique_sequences or $passed_sequences) {
        warn "No sequences pass with the chosen criteria - relaxing selection criteria";
        $html->append_line("\n  <b>No putative probes pass with the above criteria\n");
        $html->append_line("  Now trying: score-ratio 2, and percent repetitive 40</b>\n\n");
        $cfg->criteria_score_ratio(2);
        $cfg->criteria_percent_repetitive(40);
        ($passed_sequences, $failed_sequences, $unique_sequences)
            = filter_by_unique_or_score_and_repetitive_criteria($cfg, \@sequences);
    }
    unless ($unique_sequences or $passed_sequences) {
        warn "No sequences pass with the relaxed criteria - relaxing selection criteria";
        $html->append_line("\n  <b>No putative probes pass with the relaxed criteria\n");
        $html->append_line("  Now trying: score-ratio 2, and percent repetitive 80</b>\n\n");
        $cfg->criteria_score_ratio(2);
        $cfg->criteria_percent_repetitive(80);
        ($passed_sequences, $failed_sequences, $unique_sequences)
            = filter_by_unique_or_score_and_repetitive_criteria($cfg, \@sequences);
    }
    unless ($unique_sequences or $passed_sequences) {
        confess "No putative probes pass even with relaxed selection criteria";
    }

    my $total_passed_seqs = 0;
    $total_passed_seqs += scalar(@$unique_sequences) if $unique_sequences;
    $total_passed_seqs += scalar(@$passed_sequences) if $passed_sequences;
    
    $html->append_line("Putative probes exceeding selection criteria    : "
        . $total_passed_seqs . "/" .  scalar(@sequences) . " (both unique and not unique)\n");

    #Generate graphic
    print "Generating image\n";
    my $graphic_file = make_probe_image_2($cfg, $probe, $unique_sequences
        , $passed_sequences, $failed_sequences);
    print "  - done\n";
    
    #Filter out the 'redundant' sequences to separate list
    my ($deleted_count, $redundant_sequences)
        = remove_overlapping_sequences($probe, $passed_sequences);

    #Sort the various sets of putative probes
    my ($sorted_unique_sequences, $sorted_passed_sequences, $sorted_redundant_sequences);
    $sorted_unique_sequences     = sort_unique_sequences($unique_sequences) if $unique_sequences;
    $sorted_passed_sequences     = sort_sequences($passed_sequences) if $passed_sequences;
    $sorted_redundant_sequences  = sort_sequences($redundant_sequences) if $redundant_sequences;

    #Generate primers
    print "Generating primers\n";
    my $successful_primer_picks = generate_primers($cfg, $sorted_unique_sequences
        , $sorted_passed_sequences, $sorted_redundant_sequences, $probe, $db_txt);
    print "  - done\n";
    #my $total_passed_seqs = 0;
    #$total_passed_seqs += scalar(@$sorted_unique_sequences) if $sorted_unique_sequences;
    #$total_passed_seqs += scalar(@$sorted_passed_sequences) if $sorted_passed_sequences;
    #$total_passed_seqs += scalar(@$sorted_redundant_sequences) if $sorted_redundant_sequences;
    
    $html->append_line("Putative probes with successfully-picked primers: $successful_primer_picks"
        . "/" . $total_passed_seqs . "\n");
    
    $html->append_line("\nIdentified $deleted_count 'redundant' probes exceeding selection criteria\n"
        . "(but wholly contained by other probes, and higher repetitive DNA content)\n\n\n");

    #Include the graphic
    $html->append_line("UNIQUE (YELLOW), PASSED (GREEN) AND FAILED PUTATIVE PROBES IN THE DESIGN WINDOW\n");
    $html->append_line('-' x 79 . "\n\n");
    $html->append_line('<center><img src="' . $graphic_file . '" alt="probe image"></center><br>');
    
    generate_html_output($probe, $sorted_unique_sequences, $sorted_passed_sequences
        , $sorted_redundant_sequences);
    
    #Restore these values in case they were changed
    $cfg->criteria_score_ratio($score_ratio_cutoff);
    $cfg->criteria_percent_repetitive($percent_repetitive_cutoff);

}

sub make_probe_image {
    my ( $probe, $passed_seqs, $failed_seqs, $img_width ) = @_;
    
    confess "Dont use";
    
    my $total_seqs = 0;
    $total_seqs += scalar(@$passed_seqs) if $passed_seqs;
    $total_seqs += scalar(@$failed_seqs) if $failed_seqs;
    
    my $img_height = $total_seqs + 100;
    $img_width  = 600 unless $img_width;
    
    my $probe_window_length = $probe->end - $probe->start + 1;
    my $x_factor = $probe_window_length / $img_width;
    
    my $gd = makeGD($img_width, $img_height);
    
    my $col_scheme = 'pure_green_ext';
    my ($hi_passed, $lo_passed) = get_score_extremes($passed_seqs);
    my ($hi_failed, $lo_failed) = get_score_extremes($failed_seqs);    
    set_colour_intervals_for_scheme($col_scheme, ($lo_failed, $hi_passed));
    my $black = GD_colour('black');
    
    my $y_pos = 0;
    foreach my $seq (sort {$a->start <=> $b->start
        or $b->end - $b->start <=> $a->end - $a->start} @$passed_seqs) {
    
        my $x1_pos = ($seq->start - $probe->start) / $x_factor;
        my $x2_pos = ($seq->end - $probe->start) / $x_factor;
        $gd->line($x1_pos, $y_pos, $x2_pos, $y_pos, get_graded_colour($seq, $col_scheme));
        $y_pos += 1;    
    }
    
    $y_pos += 10;
    for (my $i = 0; $i <= 2; $i++) {
        $gd->line(0, $y_pos, $img_width - 1, $y_pos, $black);
        $y_pos++;
    }
    $y_pos += 10;
    
    foreach my $seq (sort {$a->start <=> $b->start
        or $b->end - $b->start <=> $a->end - $a->start} @$failed_seqs) {
    
        my $x1_pos = ($seq->start - $probe->start) / $x_factor;
        my $x2_pos = ($seq->end - $probe->start) / $x_factor;
        $gd->line($x1_pos, $y_pos, $x2_pos, $y_pos, get_graded_colour($seq, $col_scheme));
        $y_pos += 1;    
    }
    my $file_name = $probe->name . '_diag_2';
    print "File: ", output_GD($gd, $file_name), "\n";
}    


sub set_colour_intervals_for_scheme {
    my ( $passed_scheme_name, $low, $high ) = @_;
    
    my $colour_scheme = GD_colour_scheme($passed_scheme_name);
    
    my $abs_range = abs($low - $high);
    my $step = $abs_range / $#$colour_scheme;
    
    my $current_score = $low;
    foreach my $gd_colour_array (@$colour_scheme) {
        $gd_colour_array->[0] = $current_score;
        $current_score += $step;
    }
}

sub get_graded_colour {
    my ( $seq, $scheme, $inverse ) = @_;
    
    my $colour_scheme = GD_colour_scheme($scheme);
    
    my $i = 0;
    while ($i <= $#$colour_scheme and $seq->overall_score > $colour_scheme->[$i]->[0]) {
        $i++;
    }
    $i = $#$colour_scheme if $i > $#$colour_scheme;
    
    my $colour = $colour_scheme->[$i]->[1];
    unless ($colour) {
        confess "Tried to return colour indexed: $i";
    } 
    return $colour;
}

sub get_score_extremes {
    my ( $seqs ) = @_;
    
    my ($highest, $lowest);
    foreach my $seq (@$seqs) {
    	
        unless ($highest) {
            $highest = $seq->overall_score;
            $lowest  = $seq->overall_score;
            next;
        }
        if ($seq->overall_score < $lowest) {
            $lowest  = $seq->overall_score;
        }
        if ($seq->overall_score > $highest) {
            $highest = $seq->overall_score;
        }
    }
    return ($highest, $lowest);
}

sub generate_primers {
    my ( $cfg, $unique_sequences, $sequences
        , $redundant_sequences, $probe, $db_txt ) = @_;

    my $total_seqs_yielding_primers = 0;
    $html_primer = GeneTargeting::Utils::HTMLReport->new;
    my $probe_name = $probe->name;

    my $output_file = $probe_name . '_probe_search_primers.html';

    $html_primer->write_mode('clobber') if $clobber;
    $html_primer->title("$probe_name - Primers for putative probes");
    $html_primer->append_line("<pre><b>$0 : " . scalar(localtime) . "</b>\n");
    $html_primer->append($db_txt);

    $html_primer->append_line("Primers for recovering probe design: $probe_name (id: " . $probe->id . ")\n");
    $html_primer->append_line("\n");

    my $parameters = get_primer3_parameters_from_config($cfg);
    
    #Output the Primer3 parameters
    $html_primer->append_line("<b>Primer3 parameters</b>\n");
    print "Primer3 parameters used\n";
    print "------------------\n\n";
    my $primer_param_txt = print_hash_ref($parameters);
    $html_primer->append($primer_param_txt);
    $html_primer->append_line("\n");
    
    #Add the notes here
    $html_primer->append_line("NOTE Tm QUOTED IN RESULTS BELOW ARE ACCORDING TO THE FORMULA: ");
    $html_primer->append_line("  Tm = (length * 2) + (GC * 2) - 5\n\n");

    $html_primer->append_line("UNIQUE PUTATIVE PROBES\n");   
    $html_primer->append_line("----------------------\n\n");
    $total_seqs_yielding_primers += make_primers_for_seqs($unique_sequences, $parameters);

    $html_primer->append_line("\nPUTATIVE PROBES\n");   
    $html_primer->append_line("---------------\n\n");   
    $total_seqs_yielding_primers += make_primers_for_seqs($sequences, $parameters);
    
    $html_primer->append_line("\nREDUNDANT PROBES\n");   
    $html_primer->append_line("----------------\n\n");   
    $total_seqs_yielding_primers += make_primers_for_seqs($redundant_sequences, $parameters);
    
    $html_primer->append_line("</pre>");
    $html_primer->write($cfg->output_dir_loc . '/' . $output_file);
    print "Wrote: ", $cfg->output_dir_loc . '/' . $output_file, "\n\n";
    return $total_seqs_yielding_primers;        
}

{
    my $primer3_call_count;
    
    sub make_primers_for_seqs {
        my ( $sequences, $parameters ) = @_;

        my $sequences_yielding_primers = 0;
        foreach my $sequence (@$sequences) {

            #Custom set the size range of products to be allowed by
            #Primer3 based on the length of the probe
            my $product_size_wobble;
            my $dna_length = length($sequence->dna);
            if ($dna_length >= 900) {
                $product_size_wobble = 300;
            } elsif ($dna_length >= 700) {
                $product_size_wobble = 200
            } else {
                $product_size_wobble = 150;    
            }

            my $primer3_results = design_with_primer3($sequence, $tmp_file
                , $parameters, $product_size_wobble);
            $html_primer->append_line($html->make_anchor_name($sequence->id, '<b>Primers for ID: '
                . $sequence->id) . "</b>\n");

            if ($primer3_results) {
                output_primer3_results($sequence, $primer3_results, 2);
                $sequences_yielding_primers++;
            } else {
                $html_primer->append_line("\n" . ' ' x 5 . " ***NONE***\n\n");
                $html_primer->append_line('=' x 31 . "\n\n");
            }
            $primer3_call_count++;
            if ($primer3_call_count >= 500) {
                $html_primer->append_line("\nPrimer3 called 500 times - limit reached\n");
                last;
            }
        }
        return $sequences_yielding_primers;
    }
}

sub output_primer3_results {
    my ( $sequence, $primer3_results, $max_results ) = @_;
    
    if ($max_results > $primer3_results->number_of_results) {
        $max_results = $primer3_results->number_of_results;
    }
    
    for (my $i = 0; $i < $max_results ; $i++) {

        my $results = $primer3_results->primer_results($i);

        my $offset = $sequence->start - 1;
        my $left_chr_start  = $offset + $results->{PRIMER_LEFT_START};
        my $left_chr_end    = $offset + $results->{PRIMER_LEFT_END};
        my $right_chr_start = $offset + $results->{PRIMER_RIGHT_START};
        my $right_chr_end   = $offset + $results->{PRIMER_RIGHT_END};

        my $line1 = "  Primer left" . ' ' x 28 . "Primer right\n";
        my $line2_left  = "  start: $left_chr_start";
        my $line2_right = "  start: $right_chr_start\n"; 

        my $line3_left  = "  end  : $left_chr_end";
        my $line3_right = "  end  : $right_chr_end\n"; 

        my $line4_left  = "  tm   : $results->{PRIMER_LEFT_TM_RECALC}";
        my $line4_right = "  tm   : $results->{PRIMER_RIGHT_TM_RECALC}\n";
        
        my $line5_left  = "  seq  : $results->{PRIMER_LEFT_SEQUENCE}";
        my $line5_right = "  seq  : $results->{PRIMER_RIGHT_SEQUENCE}\n";

        $html_primer->append_line("Primer pair " . ($i + 1) .
            ", product size : $results->{PRIMER_PRODUCT_SIZE}\n");
        $html_primer->append_line($line1);
        $html_primer->append_line(sprintf("%-39s", $line2_left) . $line2_right);
        $html_primer->append_line(sprintf("%-39s", $line3_left) . $line3_right);
        $html_primer->append_line(sprintf("%-39s", $line4_left) . $line4_right);
        $html_primer->append_line(sprintf("%-39s", $line5_left) . $line5_right);
                
        #We need to output the actual product sequence here.
        my $product_sequence = substr($sequence->dna, $results->{PRIMER_LEFT_START} - 1
            , $results->{PRIMER_RIGHT_END} - $results->{PRIMER_LEFT_START} + 1);
        my $product_id = '>' . $sequence->id . '_primer_pair_' . ($i + 1) . '_product';    
        $html_primer->append_line("\n  $product_id\n");
        $html_primer->append_line(format_string_sixty_chrs_per_line($product_sequence, 2));
        
        if ($i + 1 < $max_results) {
            $html_primer->append_line('  ' . '-' x 20 . "\n\n");
        } else {
            $html_primer->append_line('  ' . '=' x 60 . "\n\n");
        }
    }
    
}

sub select_primers {
    my ( $sequence, $probe, $primer3_results ) = @_;
    
    my @sorted_primers;
    if ($probe->end_bias eq '5prime') {
        @sorted_primers = sort {$b->{'PRIMER_PRODUCT_SIZE'} <=> $a->{'PRIMER_PRODUCT_SIZE'}
            or $a->{'PRIMER_LEFT_START'} <=> $b->{'PRIMER_LEFT_START'}
        } @{$primer3_results};
    } 
    return \@sorted_primers;
}

sub output_threshold_criteria {
    my ( $cfg, $probe ) = @_;

    $html->append_line("Selection criteria to rank probes:\n");
    $html->append_line("  Minimum score ratio             : "
        . $cfg->criteria_score_ratio . "\n");
    $html->append_line("  (i.e. self-hit must score at least " 
        . $cfg->criteria_score_ratio . " times higher than the next best hit)\n");
    $html->append_line("  Maximum percent repetitive bases: "
        . $cfg->criteria_percent_repetitive . "\n");
    if ($probe->end_bias) {
        $html->append_line("  Favour probes to the " . $probe->end_bias
            . " end of design window\n");
    } else {
        $html->append_line("  No positional bias in selecting probes\n");
    }
    $html->append_line("  ------------------------\n\n");    
}

sub sort_sequences {
    my ( $sequences ) = @_;
    
    my @sorted_sequences = sort {$b->overall_score <=> $a->overall_score
        or $a->percent_repetitive <=> $b->percent_repetitive
        or $a->distance_from_ideal <=> $b->distance_from_ideal} @$sequences;
    return \@sorted_sequences;    
}

sub sort_unique_sequences {
    my ( $sequences ) = @_;
    
    confess "Must pass a reference to an array" unless $sequences;
    my @sorted_sequences = sort {$a->percent_repetitive <=> $b->percent_repetitive
        or length($b->dna) <=> length($a->dna)
        or $a->distance_from_ideal <=> $b->distance_from_ideal} @$sequences;
    return \@sorted_sequences; 
}

sub add_overall_score {
    my ( $probe, $sequence ) = @_;
    
    my $score;
    if ($sequence->score_ratio) {
        $score = $sequence->score_ratio;
    } else {
        $score = 40; #Unique
    }
    $score -= $sequence->percent_repetitive * 2;
    $sequence->overall_score($score);
}

sub add_percent_repetitive {
    my ( $sequence ) = @_;
    
    my $percent_repetitive = ($sequence->self_hit->soft_masked
            / length($sequence->dna)) * 100;
    
    $sequence->percent_repetitive($percent_repetitive);    
}

sub generate_html_output {
    my ($probe, $unique_sequences, $sequences, $removed_sequences) = @_;

    #Output any unique probes first
    $html->append_line("UNIQUE PUTATIVE PROBES\n");
    $html->append_line("----------------------\n\n");
    $html->append_line("Ranked by repetitive DNA content and length\n\n");
    output_html_summary_for_sequences($unique_sequences, $probe);
    
    $html->append_line("\nPUTATIVE PROBES\n");
    $html->append_line("---------------\n\n");
    $html->append_line("Ranked by ");

    $html->append_line("overall score = score_ratio - 2 * repetitive DNA content\n\n");
    output_html_summary_for_sequences($sequences, $probe);
    
    if ($removed_sequences) {
        $html->append_line("\nREDUNDANT PROBES\n");
        $html->append_line("----------------\n\n");
        output_html_summary_for_sequences($removed_sequences, $probe);
    }

    $html->append_line("\nUNIQUE PUTATIVE PROBES\n");
    $html->append_line("----------------------\n\n");
    output_html_dna_for_sequences($probe, $unique_sequences);
    
    $html->append_line("\nPROBE SEQUENCES\n");
    $html->append_line("---------------\n\n");
    output_html_dna_for_sequences($probe, $sequences);
    
    if ($removed_sequences) {
        $html->append_line("\nREDUNDANT PROBE SEQUENCES\n");
        $html->append_line("-------------------------\n\n");
        output_html_dna_for_sequences($probe, $removed_sequences);
    }
}

sub output_html_dna_for_sequences {
    my ( $probe, $sequences ) = @_;
    
    my $primer_output_file = $probe->name . '_probe_search_primers.html';
    foreach my $sequence (@$sequences) {
    
        my $seq_id = $sequence->id;
        $html->append_line($html->make_anchor_name($seq_id, '>' . $seq_id) . " ");
        $html->append_line($html->make_link($primer_output_file . '#' . $seq_id
            , "Get primers for probe ID: $seq_id sequence", "Get primers\n"));

        my $truncated_seq = $sequence->dna;
        my $formatted_seq;
        
        while ($truncated_seq =~ /(.{1,60})/g) {
            $formatted_seq .= "$1\n";
        }
        $html->append_line($formatted_seq . "\n");
    }
    
    #In some cases its getting an empty array rather than undef
    unless (@$sequences) {
        $html->append_line(" -- None\n");
    }
}

sub output_html_summary_for_sequences {
    my ( $sequences, $probe ) = @_;
    
    my $rank = 1;
    foreach my $sequence (@$sequences) {

        my $seq_length = length($sequence->dna);
        my $seq_id = $sequence->id;
        
        my $rank_string = ' ' x (3 - length($rank)) . $rank . ' ';
        $html->append_line($rank_string . $html->make_link('#' . $seq_id
            , "Get probe ID: $seq_id sequence", "ID: " . ' ' x (6 - length($seq_id))
            . $seq_id));
        $html->append_line(" length: " . ' ' x (4 - length($seq_length)) . $seq_length);
        unless ($sequence->is_unique) { 
            $html->append_line(" score ratio: " . sprintf("%4.1f", $sequence->score_ratio));
        } else {
            $html->append_line(" UNIQUE ");
        }
        $html->append_line(" actual: " . $sequence->self_hit->score);
        $html->append_line(' %repetitive DNA ');
        if ($sequence->percent_repetitive) {
            $html->append_line(sprintf("%4.1f" , $sequence->percent_repetitive));
        } else {
            $html->append_line('  - ');
        }
        
        if ($probe->end_bias) {
            my $distance_string = ' ' x (5 - length($sequence->distance_from_ideal)) 
                . $sequence->distance_from_ideal;
            $html->append_line(" distance: $distance_string ");
        }
        
        $html->append_line(" start: " . $sequence->start . " end: " . $sequence->end);
        unless ($sequence->is_unique) {
            $html->append_line(" overall: " . sprintf("%5.1f", $sequence->overall_score));
        }
        $html->append_line("\n");
        $rank++;
    }
    unless (@$sequences) {
        $html->append_line("  - None\n");
    }
}

sub check_state_of_Jobs_for_DNAProbe {
    my ( $dba, $probe ) = @_;
    
    my ($probe_aptr, $seq_aptr, $hit_aptr, $conf_aptr, $job_aptr)
        = get_adaptors($dba);
    
    my $job_ids = $job_aptr->get_ids_by_DNAProbe_name($probe->name)
        or confess "Got no job ids for ", $probe->name;
        
    my %job_state;
    my $bad_state;
    my $conf_id;
    foreach my $job_id (@$job_ids) {
        
        my $job = $job_aptr->fetch_by_db_id($job_id)
            or confess "Failed to get job with fetch_by_db_id: $job_id";
        $conf_id = $job->analysis_conf_id unless $conf_id;
        $bad_state++ unless $job->state eq 'success';
        $job_state{$job->state} ||= [];
        push(@{$job_state{$job->state}}, $job_id);
    }
    
    if ($bad_state) {
        print "Not all exonerate jobs for probe design are successful:\n\n";
        print "State        Job ids\n";
        print "-----        -------\n";
        foreach my $state (sort(keys(%job_state))) {
            print "$state:" , ' ' x (12 - length($state));
            foreach my $job_id (@{$job_state{$state}}) {
                print "$job_id ";
            }
            print "\n";
        }
        return;
    }
    
    #Else get the search Conf and return
    $html->append_line("All jobs for probe design are successful - total: "
        . scalar(@$job_ids) . "\n");
    my $conf = $conf_aptr->fetch_by_db_id($conf_id)
        or confess "Failed to fetch Conf with fetch_by_db_id '$conf_id'";
    
    $html->append_line('Fetched conf : ' . $conf->name . " (id: "
        . $conf->id . ")\n");
    $html->append_line('  Genome file(s) at: ' . $conf->fasta_dir . "\n");
    return $conf;
}

sub remove_overlapping_sequences {
    my ( $probe, $sequences ) = @_;
    
    my @removed_sequences;
    my $deleted_count = 0;
    my $loop = 1;
    while ($loop) {
    
        my $deleted_in_this_iteration;
        for (my $i = 0; $i <= $#$sequences; $i++) {

            my $candidate = $sequences->[$i];
            my ($overlapped, $overlapped_by_seq)
                = is_wholly_overlapped_seq($candidate, $sequences);
            if ($overlapped) {
                if ($candidate->percent_repetitive > $overlapped_by_seq->percent_repetitive) {
                    push (@removed_sequences, $candidate);
                    splice(@$sequences, $i, 1);
                    print "Deleted sequence Id: ", $candidate->id, "\n"
                        if $debug;
                    $deleted_in_this_iteration++;
                    $deleted_count++;
                    last;
                }
            }
        }
        $loop = 0 unless $deleted_in_this_iteration;
    }
    return ($deleted_count, \@removed_sequences);
}

sub is_wholly_overlapped_seq {
    my ( $test_sequence, $sequences ) = @_;
    
    my ($overlapped, $overlapped_by_seq);
    foreach my $sequence (@$sequences) {
        next if $sequence == $test_sequence;
        if ($test_sequence->start >= $sequence->start
            and $sequence->end >= $test_sequence->end) {
            $overlapped_by_seq = $sequence;
            $overlapped++;
            if ($debug) {
                print "Detected match between id: ", $test_sequence->id;
                print " and id: ", $sequence->id, "\n";
            }
            last;
        }
    }
    return ($overlapped, $overlapped_by_seq);
}

sub add_distance_from_ideal {
    my ( $probe, $sequence ) = @_;
    
    my $distance;
    if ($probe->end_bias eq '5prime') {
        $distance = $sequence->start - $probe->start;
    } elsif ($probe->end_bias eq '3prime') {
        $distance = $probe->end - $sequence->end;
    } else {
        confess "Cant interpret probe end bias: ", $probe->end_bias;
    }
    $sequence->distance_from_ideal($distance);
}

sub filter_by_unique_or_score_and_repetitive_criteria {
    my ( $cfg, $sequences ) = @_;
    
    my ($filtered_sequences, $failed_sequences, $unique_sequences);
    foreach my $sequence (@$sequences) {
    
        if ($sequence->is_unique) {
            $unique_sequences ||= [];
            push (@$unique_sequences, $sequence);
        } else {
            unless ($sequence->score_ratio) {
                confess "Bad score ratio for sequence id: ", $sequence->id;
            }
                        
            if ($sequence->score_ratio >= $cfg->criteria_score_ratio and 
                $sequence->percent_repetitive <= $cfg->criteria_percent_repetitive) {

                $filtered_sequences ||= []; 
                push (@$filtered_sequences, $sequence);
            } else {
                $failed_sequences ||= [];
                push (@$failed_sequences, $sequence);
            }
        }
    }
    return ($filtered_sequences, $failed_sequences, $unique_sequences);
}

sub add_score_ratio {
    my ( $sequence, $hits ) = @_;

    my $self_hit = $sequence->self_hit;
    
    my @component_hits;
    foreach my $hit (@$hits) {
        push (@component_hits, @{$hit->get_ComponentHits});
    }
    @component_hits = sort {$b->score <=> $a->score} @component_hits;
    
    unless ($self_hit == $component_hits[0]) {
        confess "Highest scoring hit is not the self_hit";
    }
    
    #Check we actually have more than just the self_hit. 
    if (@component_hits > 1) {
        $sequence->score_ratio($self_hit->score / $component_hits[1]->score);
        return 1;
    } else {
        print STDERR "Found a unique sequence\n" if $debug;
        $sequence->is_unique(1);
        return;
    }
}

### add_self_hit
#
# Should be able to parse the NCBIM33 files with lines like:
#   >1.1-195203927 RepeatMask Dust                           (on assembly)
#   >10_random_NT_081852.1-47794 RepeatMask Dust             (off assembly)
#
# And NCBIM34
#   >chromosome:NCBIM34:1:1:195109612:1 chromosome 1         (on assembly)
#   >supercontig::NT_039389:1:13453:1 supercontig NT_039389  (off assembly)
#

sub add_self_hit {
    my ( $sequence, $hits ) = @_;

    unless ($hits) {
        print STDERR "No hits for Seqence id: ", $sequence->id, "\n";
    }
    
    my ($found, $self_hit);    
    my $seq_id = $sequence->id;
    
    foreach my $hit (@$hits) {

        my $accession = $hit->accession;
        if (index($accession, '_') < 0) {
        
            my $chr;
            if ($accession =~ /^chromosome/) {
                my @fields = split(/:+/, $accession);
                $chr = $fields[2]
                    or confess "Could not parse chromosome from '$accession'";
            } else {
                $chr = substr($accession, 0, index($accession, '.'));
            }
                                
            if ($chr eq $sequence->chromosome) {

                my $component_hits = $hit->get_ComponentHits;             
                foreach my $component_hit (@$component_hits) {

                    if (is_self_hit($sequence, $component_hit)) {
                        $found++;
                        $self_hit = $component_hit
                    }
                }
            }
        } else {
            print STDERR "Hit is not on the golden path\n" if $debug;
        }
    }

    if ($found == 0) {
        print STDERR "Failed to find the self_hit for sequence id: $seq_id\n";
    } elsif ($found == 1) {
        print STDERR "Single self_hit found for for sequence id: $seq_id\n"
            if $debug;
        $sequence->self_hit($self_hit);
        return 1;
    } elsif ($found > 1) {
        confess "Found more than one self_hit for sequence id: $seq_id\n";   
    }
}

sub is_self_hit {
    my ($sequence, $component_hit) = @_;
    
    #This one is dodgy
    return unless $component_hit->query_strand == $component_hit->subject_strand;
    return unless abs($sequence->start - $component_hit->subject_start) < 10;
    return unless abs($sequence->end - $component_hit->subject_end) < 10;
    return unless $component_hit->percent_id >= 98;
    return 1;
}

sub get_adaptors {
    my ( $dba ) = @_;

    my $probe_aptr = $dba->get_DNAProbeAdaptor()
        or confess "Could not get DNAProbeAdaptor";
    my $seq_aptr   = $dba->get_SequenceAdaptor()
        or confess "Could not get SequenceAdaptor";
    my $hit_aptr   = $dba->get_HitAdaptor()
        or confess "Could not get HitAdaptor";
    my $conf_aptr  = $dba->get_ConfAdaptor()
        or confess "Could not get ConfAdaptor";
    my $job_aptr  = $dba->get_JobAdaptor()
        or confess "Could not get JobAdaptor";
    
    return ($probe_aptr, $seq_aptr, $hit_aptr, $conf_aptr, $job_aptr);
}


sub configure_parameters {

    my $mandatory = {};
    my $optional  = {};

    $mandatory->{'analysis_database'} = ['db', 'host', 'user'];
     $optional->{'analysis_database'} = ['pass', 'port'];
    
    $mandatory->{criteria} = ['score_ratio', 'percent_repetitive'];

    $mandatory->{'output_dir'}       = ['loc'];

    $mandatory->{'primer3'} = ['PRIMER_MIN_SIZE', 'PRIMER_MAX_SIZE'
        , 'PRIMER_MIN_GC', 'PRIMER_MAX_GC', 'PRIMER_MIN_TM', 'PRIMER_MAX_TM'
        , 'PRIMER_NUM_RETURN', 'PRIMER_SALT_CONC'];
        
     $optional->{'primer3'} = ['PRIMER_OPT_SIZE', 'PRIMER_OPT_GC_PERCENT', 'PRIMER_GC_CLAMP'];
    return ($mandatory, $optional);
}
