#!/usr/bin/perl -w
#
# Perl script for computing kernel matrices using the coiled coil kernel
# (c) 2011, Ulrich Bodenhofer, Institute of Bioinformatics, Johannes
#     Kepler University, Linz, Austria
#
# If you use this program, please cite:
#
#     C. C. Mahrenholz, I. G. Abfalter, U. Bodenhofer, R. Volkmer, and
#     S. Hochreiter. Complex networks govern coiled coil oligomerization -
#     predicting and profiling by means of a machine learning approach.
#     Mol. Cell. Proteomics 10(5):M110.004994, 2011.
#     DOI: 10.1074/mcp.M110.004994
#
# License: GPL2
#

use strict;
use warnings;

PrintUsage() unless (@ARGV);

my $norm = 1;
my $m = 7;
my $format = "CSVH";
my $trainfile = "";
my $testfile = "";

while (@ARGV)
{
    if ($ARGV[0] =~ /^\-norm=(0|1)$/)
    {
	$norm = $1;
	shift(@ARGV);
    }
    elsif ($ARGV[0] =~ /^\-m=(\d+)$/)
    {
	$m = $1;

	die "Choose m between 0 and 20\n" if ($m > 20);

	shift(@ARGV);
    }
    elsif ($ARGV[0] =~ /^\-output=(LIBSVM|CSV|CSVH)$/)
    {
	$format = $1;
	shift(@ARGV);
    }
    elsif ($ARGV[0] =~ /^\-/)
    {
	die "Invalid option $ARGV[0]\n";
    }
    elsif ($trainfile eq "")
    {
	$trainfile = $ARGV[0];
	shift(@ARGV);
    }
    elsif ($testfile eq "")
    {
	$testfile = $ARGV[0];
	shift(@ARGV);
    }
    else
    {
	warn "Argument $ARGV[0] ignored\n";
	shift(@ARGV);
    }
}

die "No training file name specified\n" if ($trainfile eq "");

my $trainNum = 0;
my @trainSet;

die "Could not open $trainfile\n" unless (open(TRAIN, "<$trainfile"));

while (my $line = <TRAIN>)
{
    chomp $line;

    if ($line =~ /^([^,]+),([A-Z]+),([a-g]+),(DIMER|TRIMER)$/)
    {
	$trainSet[$trainNum] = {ID => $1, SEQ => $2, REG => $3, LABEL => $4};
	$trainNum++;
    }
}

close(TRAIN);

print STDERR "Training data: $trainNum samples read\n";

my $testNum = 0;
my @testSet;

if ($testfile)
{
    die "Could not open $testfile\n" unless (open(TEST, "<$testfile"));

    while (my $line = <TEST>)
    {
        chomp $line;

        if ($line =~ /^([^,]+),([A-Z]+),([a-g]+),(DIMER|TRIMER)$/)
        {
	    $testSet[$testNum] = {ID => $1, SEQ => $2, REG => $3, LABEL => $4};
	    $testNum++;
        }
    }

    close(TEST);

    print STDERR "Test data: $testNum samples read\n";
}


## now compute matrix

my @K;
my @trainSelf;

for (my $i = 0; $i < @trainSet; $i++)
{
    $trainSelf[$i] = CoiledCoilKernelSelf($trainSet[$i]->{SEQ},
					  $trainSet[$i]->{REG});
}

if ($testfile)
{
    for (my $i = 0; $i < @testSet; $i++)
    {
        my $testself = CoiledCoilKernelSelf($testSet[$i]->{SEQ},
					    $testSet[$i]->{REG});

	for (my $j = 0; $j < @trainSet; $j++)
	{
	    $K[$i][$j] = CoiledCoilKernel($testSet[$i]->{SEQ},
					  $testSet[$i]->{REG},
					  $trainSet[$j]->{SEQ},
					  $trainSet[$j]->{REG}) /
			 ($norm ? (sqrt($testself) *
				   sqrt($trainSelf[$j])) : 1);
	}
    }    
}
else
{
    for (my $i = 0; $i < @trainSet; $i++)
    {
	if ($norm)
	{
	    $K[$i][$i] = 1;
	}
	else
	{
	    $K[$i][$i] = $trainSelf[$i];
	}

	for (my $j = $i + 1; $j < @trainSet; $j++)
	{
	    $K[$i][$j] = CoiledCoilKernel($trainSet[$i]->{SEQ},
					  $trainSet[$i]->{REG},
					  $trainSet[$j]->{SEQ},
					  $trainSet[$j]->{REG}) /
			 ($norm ? (sqrt($trainSelf[$i]) *
				   sqrt($trainSelf[$j])) : 1);

	    $K[$j][$i] = $K[$i][$j];
	}
    }

    @testSet = @trainSet;
}

if ($format eq "LIBSVM")
{
    WriteLIBSVM();
}
elsif ($format eq "CSV")
{
    WriteCSV(0);
}
elsif ($format eq "CSVH")
{
    WriteCSV(1);
}

    
sub PrintUsage
{
    print "\n";
    print "Usage: perl CoiledCoilKernel.pl [options] trainingfile [testfile]\n";

    print "\nIf only a training file is specified, this script reads it\n";
    print "and computes a quadratic kernel matrix using the coiled coil\n";
    print "kernel. The choice of m and whether kernel normalization should\n";
    print "be applied can be controlled by options (see below).\n";
    print "If a test data file is specified additionally, a matrix of\n";
    print "kernel values of test samples (corresponding to rows) versus\n";
    print "training samples (corresponding to columns) is computed.\n";
    print "Output is written to STDOUT; the output format can be chosen\n";
    print "using the -output option (see below).\n\n";

    print "Options: -norm=(0|1) .. determines whether kernel normalization\n";
    print "                        should be applied (default: 1)\n";
    print "         -m=[int] .. determines the parameter m of the coiled\n";
    print "                     coiled kernel, valid values range from 0 to\n";
    print "                     20 (default: 7)\n";
    print "         -format=[str] .. determines the output format; valid\n";
    print "                          values are (default: CSVH):\n";
    print "                             LIBSVM .. LIBSVM input format\n"; 
    print "                             CSV    .. comma-separated table\n"; 
    print "                             CSVH   .. comma-separated table\n"; 
    print "                                       with headers\n\n";

    print "Required input format: CSV with four columns:\n";
    print "   Column 1: identifier/name of sequence\n";
    print "   Column 2: amino acid sequence\n";
    print "   Column 3: heptad register\n";
    print "   Column 4: oligomerization (DIMER or TRIMER)\n\n";

    exit(0);
}

sub CoiledCoilKernel
{
    my ($seq1, $reg1, $seq2, $reg2) = @_;

    my ($n1, $n2) = (length($seq1), length($seq2));

    my %hash1;
    my %hash2;

    my $dotfill = '';

    for (my $l = 0; $l <= $m; $l++)
    {
        for (my $i = 0; $i < $n1 - $l - 1; $i++)
	{
	    my $key = substr($seq1, $i, 1) . $dotfill .
	              substr($seq1, $i + $l + 1, 1) . substr($reg1, $i, 1);

	    $hash1{$key}++;
	}

	$dotfill .= '.';
    }

    $dotfill = '';

    for (my $l = 0; $l <= $m; $l++)
    {
        for (my $i = 0; $i < $n2 - $l - 1; $i++)
	{
	    my $key = substr($seq2, $i, 1) . $dotfill .
	              substr($seq2, $i + $l + 1, 1) . substr($reg2, $i, 1);

	    $hash2{$key}++ if (exists($hash1{$key}));
	}

	$dotfill .= '.';
    }

    my $ret = 0;

    foreach my $key (keys(%hash2))
    {
        $ret += ($hash1{$key} * $hash2{$key});
    }

    return $ret;
}

sub CoiledCoilKernelSelf
{
    my ($seq, $reg) = @_;

    my $n = length($seq);

    my %hash;

    my $dotfill = '';

    for (my $l = 0; $l <= $m; $l++)
    {
        for (my $i = 0; $i < $n - $l - 1; $i++)
	{
	    my $key = substr($seq, $i, 1) . $dotfill .
	              substr($seq, $i + $l + 1, 1) . substr($reg, $i, 1);

	    $hash{$key}++;
	}

	$dotfill .= '.';
    }

    my $ret = 0;

    foreach my $key (keys(%hash))
    {
        $ret += ($hash{$key} ** 2);
    }

    return $ret;
}


sub WriteCSV
{
    my ($headers) = @_;

    if ($headers)
    {
	my @trainheaders;

	for (my $i = 0; $i < @trainSet; $i++)
	{
	    $trainheaders[$i] = $trainSet[$i]->{ID};
	}

	print "Class,", join(",", @trainheaders), "\n";
    }

    for (my $i = 0; $i < @testSet; $i++)
    {
	print "$testSet[$i]->{ID}," if ($headers);
	print ($testSet[$i]->{LABEL} eq "TRIMER" ? "+1" : "-1");

	for (my $j = 0; $j < @trainSet; $j++)
	{
	    print ",$K[$i][$j]";
	}

	print "\n";
    }
}

sub WriteLIBSVM
{
    for (my $i = 0; $i < @testSet; $i++)
    {
	print ($testSet[$i]->{LABEL} eq "TRIMER" ? "+1" : "-1");
	print " 0:", $i + 1;

	for (my $j = 0; $j < @trainSet; $j++)
	{
	    print " ", $j + 1, ":$K[$i][$j]";
	}

	print "\n";
    }
}
