#!/usr/bin/env perl

#########################################
#
# slp: precluster nearest neighbor, 2%
#
# Author: Susan Huse, shuse@mbl.edu
#
# Date: Thu May 14 13:43:16 EDT 2009
#
# Copyright (C) 2009 Marine Biological Laboratory, Woods Hole, MA
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# For a copy of the GNU General Public License, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# or visit http://www.gnu.org/copyleft/gpl.html
#
# Keywords: otu cluster distance pairwise dotur mothur precluster single linkage nearest neighbor
# 
# Assumptions: 
#
# Revisions:
#
# Programming Notes:
#
########################################
use strict;
use warnings;

#######################################
#
# Set up usage statement
#
#######################################
# Long-form description printed for -h and when no arguments are given.
my $script_help = "
 slp - clusters reads into OTUs using the single linkage (nearest neighbor)
       test.  Reads are first sorted by most frequent, then clustered.

       Requires a pairwise distance file and a names file
       Currently distance file should be 3 columns, not in a matrix format (distmatrix2col).
 
\n";

# Short usage/options summary printed alongside $script_help and on any
# commandline error.
my $usage = "
   Usage:  slp -d distancefile -f fastafile -n namesfile -w clusterwidth > out.txt
      ex:  slp -n entero.names -d entero.fa.m2.dist3 -w 0.03 

 Options:  
           -d  distance file, in three-column format with pairwise distances for each OTU
           -f  fasta file corresponding to the names file
           -n  names file, with lookups to duplicates for ids in the distance file
           -w  clustering width (e.g., unique, 0.03, 0.06, 0.10)
           -p  write out to log file pairs that are wider than a specified clustering width (e.g., 0.05)
           -i  otu size for iterate clustering (all otus smaller than <= otu_size will be double-checked)
           -o  output file prefix [default = distance file prefix (s/\.dist/.nn02.list .slp.unique.fa, ./slp.names)
\n";

#######################################
#
# Definition statements
#
#######################################
#Commandline parsing
#my $arg_count = 3;
my $min_arg_count = 2;      # NOTE(review): declared but never checked anywhere below
my $max_arg_count = 4;      # NOTE(review): declared but never checked anywhere below
my $verbose = 0;            # -v: print timestamped progress messages to stdout

#Runtime variables
my $cluster_filename;       # NOTE(review): never assigned in this script, yet
                            # interpolated into the -p pairs filename — verify
my $distance_filename;      # -d: 3-column pairwise distance file
my $fasta_filename;         # -f: fasta file matching the names file
my $names_filename;         # -n: names file (index read -> comma-separated dupes)
my $otu_width;              # -w: clustering width, e.g. "unique", 0.03, 0.06
my $names;                  # set when -n is seen; NOTE(review): never read afterwards
my $names_index = 1;        # NOTE(review): never read afterwards
my $iteration_size = 10;    # -i: OTUs with abundance <= this get a second pass
my $log_pairs = 0;          # -p: width threshold for logging wide pairs (0 = off)
my $file_prefix;            # -o: output prefix [default: distance file minus extension]

#######################################
#
# Test for commandline arguments
#
#######################################

# No arguments at all: print the help plus usage and fail.
if (! $ARGV[0] ) {
	print $script_help;
	print $usage;
	exit -1;
} 

# Consume leading "-x" flags from @ARGV.  Value-taking flags shift twice
# (flag, then value); -h and -v shift once.  Loop stops at the first
# non-flag argument or when @ARGV is empty.
while ((scalar @ARGV > 0) && ($ARGV[0] =~ /^-/)) {
	# FIX: was /-h/, an unanchored substring match that fired for any
	# flag merely containing "-h" (e.g. "--help" was fine, but so was
	# an unrelated "--h..." typo).  Anchoring keeps -h/-help working
	# while letting genuinely unknown flags reach the error branch.
	if ($ARGV[0] =~ /^-h/) {
		print $script_help;
		print $usage;
		exit 0;
	} elsif ($ARGV[0] eq "-d") {
		shift @ARGV;
		$distance_filename = shift @ARGV;
	} elsif ($ARGV[0] eq "-n") {
		shift @ARGV;
		$names_filename = shift @ARGV;
        $names = 1;    # legacy flag; value is never read elsewhere
	} elsif ($ARGV[0] eq "-f") {
		shift @ARGV;
		$fasta_filename = shift @ARGV;
	} elsif ($ARGV[0] eq "-w") {
		shift @ARGV;
		$otu_width = shift @ARGV;
	} elsif ($ARGV[0] eq "-o") {
		shift @ARGV;
		$file_prefix = shift @ARGV;
	} elsif ($ARGV[0] eq "-i") {
		shift @ARGV;
		$iteration_size = shift @ARGV;
	} elsif ($ARGV[0] eq "-p") {
		shift @ARGV;
		$log_pairs = shift @ARGV;
	} elsif ($ARGV[0] eq "-v") {
		$verbose = 1;
		shift @ARGV;
	} elsif ($ARGV[0] =~ /^-/) { # unknown flag: report it and bail out
		print "Unknown commandline flag \"$ARGV[0]\".\n";
		print $usage;
		exit -1;
	}
}


#######################################
#
# Parse commandline arguments, ARGV
#
#######################################

# All four inputs (-d, -f, -n, -w) are mandatory.
if ( (! $fasta_filename) || (! $names_filename) || (! $distance_filename) || (! $otu_width) ) {
	print "Incorrect number of arguments.\n";
	print "$usage\n";
	exit -1;  # FIX: was a bare "exit" (status 0), which hid the failure from calling scripts
} 

#Test validity of commandline arguments
if (! -f $distance_filename) {
	print "Unable to locate input distance file: $distance_filename.\n";
	exit -1;
}

if (! -f $fasta_filename) {
	print "Unable to locate input fasta file: $fasta_filename.\n";
	exit -1;
}

if (! -f $names_filename) {
	print "Unable to locate input names file: $names_filename.\n";
	exit -1;
}

# Default output prefix: the distance filename minus its last extension.
if (! $file_prefix) 
{
    $file_prefix = $distance_filename;
    $file_prefix =~ s/\.[^\.]+$//;
}

# Build the .list output name from the width: 0.03 -> prefix.nn03.list.
# A width with no dot (e.g. "unique") passes through unchanged.
my $out_list_filename = $otu_width;
$out_list_filename =~ s/^.*\.//;
$out_list_filename = $file_prefix . ".nn" . $out_list_filename . ".list";

#######################################
#
# Open the files
#
#######################################
if ($verbose) {system("date"); print "Opening names and distance files\n";}

# open them early so user doesn't have to wait to find out that they don't open
# (three-argument open: immune to mode characters embedded in filenames)
open(DIST, '<', $distance_filename) || die("Unable to read from file: $distance_filename.  Exiting.\n");
open(FASTA, '<', $fasta_filename) || die("Unable to read from file: $fasta_filename.  Exiting.\n"); 
open(NAMES, '<', $names_filename) || die("Unable to read from file: $names_filename.  Exiting.\n"); 

if ($log_pairs) 
{
    # Width 0.05 -> suffix "05"; name the pairs log after the shared output prefix.
    (my $pairs_filename = $otu_width) =~ s/^0\.//;
    # FIX: the original interpolated $cluster_filename, which is never
    # assigned anywhere in this script, yielding an uninitialized-value
    # warning and a broken name like ".05.pairs".
    $pairs_filename = "$file_prefix.$pairs_filename.pairs";
    open(PAIRS, '>', $pairs_filename) || die("Unable to write to pairs file: $pairs_filename.  Exiting.\n");
}

#######################################
#
# Load up the pairwise distances
#
#######################################
if ($verbose) {system("date"); print "Loading distance matrix\n";}
my %distance_between;

# Read the 3-column distance file: idA idB distance.  Only pairs within
# the clustering width are remembered — mere existence of the hash key is
# later used as the "is a neighbor" test, which keeps memory small.
while (my $dist_line = <DIST>)
{
    chomp $dist_line;
    my ($id_a, $id_b, $dist) = split(/\s+/, $dist_line);

    # Pairs wider than the OTU threshold can never cluster; skip them.
    next if $dist > $otu_width;

    # Canonical key order: {lexically smaller id}{larger id}.
    my ($lo, $hi) = ($id_a lt $id_b) ? ($id_a, $id_b) : ($id_b, $id_a);
    $distance_between{$lo}{$hi}++;
}

close(DIST);
if ($verbose) {system("date"); my $dc = scalar keys %distance_between; print "Distances loaded: $dc\n";}

#######################################
#
# Get the index read and frequencies
#
#######################################
if ($verbose) {system("date"); print "Loading frequencies from names file\n";}
my %frequency_of;

# First pass over the names file: record each index read's duplicate count
# so the clustering pass can visit reads from most to least abundant.
while (my $line = <NAMES>)
{
    # [0] is the indexing read name, [1] is comma-separated duplicate read names
    chomp $line;
    my @data = split(/\s+/, $line);

    # Robustness fix: skip blank or malformed lines instead of running
    # tr/// on undef (which warned and silently counted the read as 1).
    next unless defined $data[0] && defined $data[1];

    # Store the frequency for ordering by frequency
    # tr gives the number of commas, need to +1 for the number of reads
    my $read_count = $data[1] =~ tr/,/,/;
    $frequency_of{$data[0]} = $read_count + 1;
}
close(NAMES);

#######################################
#
# Step through the unique reads and create the new clusters
#
#######################################
if ($verbose) {system("date"); print "Creating initial clusters\n";}
my @otus;            # each element: arrayref of member read ids
my %otu_abundance;   # otu index -> total read count (sum of member frequencies)

# Greedy single-linkage pass.  Visit reads from most to least abundant;
# place each read into the FIRST existing OTU that contains any read
# within the clustering width, otherwise seed a new OTU with it.
foreach my $new_read (sort { $frequency_of{$b} <=> $frequency_of{$a} } ( keys(%frequency_of) ) ) 
{
    my $has_neighbor = 0;

    # Check the read against each existing OTU.
    # (FIX: removed dead locals $start_clusters, $tester, $d, $od from the
    # original; the nested double-"last" is replaced with a loop label.)
    OTU: foreach my $i ( 0 .. $#otus ) 
    {
        foreach my $old_read ( @{ $otus[$i] } )
        {
            # %distance_between is keyed {lexically smaller id}{larger id}
            # and only holds pairs within the width, so key existence is
            # the entire neighbor test.
            if ($new_read lt $old_read)
            {
                if ( (exists $distance_between{$new_read}) && (exists $distance_between{$new_read}{$old_read}) ) 
                {
                    $has_neighbor = 1;
                } 
            } else { 
                # ($new_read gt $old_read)
                if ( (exists $distance_between{$old_read}) && (exists $distance_between{$old_read}{$new_read}) ) 
                {
                    $has_neighbor = 1;
                } 
            }

            if ($has_neighbor) 
            {
                # Add to the existing OTU and stop scanning entirely.
                push @{ $otus[$i] }, $new_read;
                $otu_abundance{$i} += $frequency_of{$new_read};
                last OTU;
            }
        }
    }

    # Did not get into an existing OTU, create a new one
    if (! $has_neighbor) 
    { 
        push @otus, [ $new_read ];  
        $otu_abundance{$#otus} = $frequency_of{$new_read};
    }
}

#######################################
#
# Re-iterate the low abundance OTUs to be sure that the order they were read did not 
# preclude them from coming into an OTU (the outlying rare is clustered before the intermediary rare)
#
####################################### 
if ($verbose) {system("date"); print "Iterating clusters\n";}
my %lost_otus;   # indexes into @otus that have been absorbed into another OTU

# Second pass: because the greedy pass sees reads in abundance order, a
# rare outlier can seed its own OTU before the intermediate read that
# would have linked it arrives.  Re-test every small OTU against all
# other surviving OTUs and merge it into the first neighbor found.
foreach my $i ( 0 .. $#otus ) 
{
    my $has_neighbor = 0;

    # only double check the small ones
    # NOTE(review): $otu_abundance{$j} is NOT incremented when OTU $i is
    # absorbed below, so a grown OTU may still be re-checked later as
    # "small" — extra work, and possibly an extra merge; confirm intended.
    if ($otu_abundance{$i} > $iteration_size) {next;}

    # for each read in the OTU
    foreach my $test_read ( @{$otus[$i]} )
    {
        # for each other OTU
        foreach my $j (0 .. $#otus) 
        {
            if ($i == $j) {next;} # Skip self
            if (exists $lost_otus{$j}) {next;} # Skip OTUs that have already been absorbed

            # for each read in each other OTU
            foreach my $old_read (@{$otus[$j]})
            {
                # Same canonical-key existence test as the first pass:
                # %distance_between is {smaller id}{larger id}, near pairs only.
                if ($test_read lt $old_read)
                {
                    if ( (exists $distance_between{$test_read}) && (exists $distance_between{$test_read}{$old_read}) ) 
                    {
                        #if ( $distance_between{$test_read}{$old_read} < $otu_width ) {$has_neighbor = 1;}
                        $has_neighbor = 1;
                        #warn join(", ", $test_read, $old_read, $distance_between{$test_read}{$old_read}) . "\n";
                    } 
    
                } else { 
                    # ($test_read gt $old_read)
                    if ( (exists $distance_between{$old_read}) && (exists $distance_between{$old_read}{$test_read}) ) 
                    {
                        #if ( $distance_between{$old_read}{$test_read} < $otu_width ) {$has_neighbor = 1;}
                        $has_neighbor = 1;
                        #warn join(", ", $test_read, $old_read, $distance_between{$old_read}{$test_read}) . "\n";
                    } 
                }

                # Stop scanning OTU $j as soon as one near pair is found.
                if ($has_neighbor) {last;}
            }

            # if found a neighbor, dump this into the other OTU
            if ($has_neighbor) 
            {
                # Add ALL of OTU $i's reads to OTU $j (single linkage:
                # one near pair merges the whole clusters).
                push @{ $otus[$j] }, @{ $otus[$i] };

                # Mark $i as absorbed; it is filtered out below.
                $lost_otus{$i}++;
                last;
            } 

                
        }
        # OTU $i was merged; no need to test its remaining reads.
        if ($has_neighbor) {last;} 
    }
}

# Copy forward every OTU that was not absorbed into another one.
my @final_otus;
foreach my $i (0 .. $#otus) 
{
    if (! exists $lost_otus{$i} ) 
    {
        push @final_otus, [ @{$otus[$i]} ]; 
    }
}

#######################################
#
# Free up the memory from the distances
#
#######################################

# The distance matrix is no longer needed; release the memory now.
undef %distance_between;
            
#######################################
#
# Load up the names 
#
#######################################
if ($verbose) {system("date"); print "Loading names file\n";}
my %copies_of;   # index read id -> its comma-separated duplicate read ids

# Second pass over the names file, this time keeping the duplicate lists
# themselves (the first pass only kept their counts).
open(NAMES, "<$names_filename") || die("Unable to read from file: $names_filename.  Exiting.\n"); 
while (my $names_line = <NAMES>)
{
    chomp $names_line;
    # column 0: indexing read name; column 1: comma-separated duplicates
    my ($index_read, $dupe_list) = split(/\s+/, $names_line);
    $copies_of{$index_read} = $dupe_list;
}
close(NAMES);

#######################################
#
# Print out the final clusters
#
#######################################
if ($verbose) {system("date"); print "Exporting clusters\n";}
open(OUT_PRECLUSTER, ">$out_list_filename") || die("Unable to write to output slp file: $out_list_filename. Exiting.\n"); 

# One output line: width <TAB> otu_count, then one TAB-separated field per
# OTU, each field being the comma-joined duplicate lists of its members.
print OUT_PRECLUSTER join("\t", $otu_width, scalar @final_otus);

# Print out the otus.  join() replaces the original trailing-comma kludge
# (a manual loop over 0 .. n-2 plus a final unsuffixed print); the bytes
# written are identical and member order is preserved.
foreach my $o (@final_otus) 
{ 
    print OUT_PRECLUSTER "\t" . join(",", map { $copies_of{$_} } @{$o});
}
print OUT_PRECLUSTER "\n";

#######################################
#
# Load up the fasta sequences 
#
#######################################
if ($verbose) {system("date"); print "Loading fasta file\n";}
my %sequence_of;       # read id -> full (unwrapped) sequence
my $current_id = "";
my $current_seq = "";

# Standard streaming fasta parse: accumulate sequence lines until the
# next defline, then flush the finished record.
while (my $fasta_line = <FASTA>)
{
    chomp $fasta_line;

    if ($fasta_line =~ /^>/)
    {
        # Flush the previous record, if any.
        $sequence_of{$current_id} = $current_seq if $current_id && $current_seq;
        $current_seq = "";

        # New id = defline minus '>' and everything after the first word.
        ($current_id = $fasta_line) =~ s/^>//;
        $current_id =~ s/\s+.*$//;
    } else {
        $current_seq .= $fasta_line;
    }
}
# Flush the final record, if any.
$sequence_of{$current_id} = $current_seq if $current_id && $current_seq;

#######################################
#
# Print out the seeds fasta
#
#######################################
if ($verbose) {system("date"); print "Exporting fasta\n";}
# FIX: the die messages now name the files actually being opened; the
# originals reported "$file_prefix.unique.fa" / "$file_prefix.names".
open(OUT_FASTA, ">$file_prefix.slp.unique.fa") || die("Unable to write to output fasta file: $file_prefix.slp.unique.fa.  Exiting.\n"); 
open(OUT_NAMES, ">$file_prefix.slp.names") || die("Unable to write to output names file: $file_prefix.slp.names.  Exiting.\n"); 

# For each otu, write one seed sequence and one names line.
foreach my $o (@final_otus)
{
    # Members were appended in decreasing abundance order, so element 0
    # is the most abundant read: use it as the OTU's seed.
    my $seed = @{$o}[0];
    print OUT_FASTA ">$seed\n" . $sequence_of{$seed} . "\n";

    # Names line: seed <TAB> comma-joined duplicates of every member read.
    my @all_dupes = map { split(/,/, $copies_of{$_}) } @{$o};
    print OUT_NAMES "$seed\t" . join(',', @all_dupes) . "\n";
}
close(OUT_FASTA);
close(OUT_NAMES);

#######################################
#
# Close out
#
#######################################
# The pairs log is only open when -p was given.
close(PAIRS) if $log_pairs;

