/*****************************************************
 Amino Acid Preference Toolkit in Java
 Pathogen Project
 Department of Computer Science and Engineering
 University of South Carolina
 Columbia, SC 29208
 Contact Email: rose@cse.sc.edu
*****************************************************/

import java.util.*;
import java.util.zip.*;

public class SequenceUtilities
{

    Hashtable codonToAminoAcidMap;
    Hashtable codonToIndexMap;
    Deflater compresser;
    Random random;

    public SequenceUtilities()
    {
	codonToIndexMap =  new Hashtable();
	codonToAminoAcidMap = new Hashtable();
	codonToIndexMap.put("CGA", new Integer(0));
	codonToIndexMap.put("CGC", new Integer(1));
	codonToIndexMap.put("CGG", new Integer(2));
	codonToIndexMap.put("CGT", new Integer(3));
	codonToIndexMap.put("AGA", new Integer(4));
	codonToIndexMap.put("AGG", new Integer(5));
	codonToIndexMap.put("CTA", new Integer(6));
	codonToIndexMap.put("CTC", new Integer(7));
	codonToIndexMap.put("CTG", new Integer(8));
	codonToIndexMap.put("CTT", new Integer(9));
	codonToIndexMap.put("TTA", new Integer(10));
	codonToIndexMap.put("TTG", new Integer(11));
	codonToIndexMap.put("TCA", new Integer(12));
	codonToIndexMap.put("TCC", new Integer(13));
	codonToIndexMap.put("TCG", new Integer(14));
	codonToIndexMap.put("TCT", new Integer(15));
	codonToIndexMap.put("AGC", new Integer(16));
	codonToIndexMap.put("AGT", new Integer(17));
	codonToIndexMap.put("ACA", new Integer(18));
	codonToIndexMap.put("ACC", new Integer(19));
	codonToIndexMap.put("ACG", new Integer(20));
	codonToIndexMap.put("ACT", new Integer(21));
	codonToIndexMap.put("CCA", new Integer(22));
	codonToIndexMap.put("CCC", new Integer(23));
	codonToIndexMap.put("CCG", new Integer(24));
	codonToIndexMap.put("CCT", new Integer(25));
	codonToIndexMap.put("GCA", new Integer(26));
	codonToIndexMap.put("GCC", new Integer(27));
	codonToIndexMap.put("GCG", new Integer(28));
	codonToIndexMap.put("GCT", new Integer(29));
	codonToIndexMap.put("GGA", new Integer(30));
	codonToIndexMap.put("GGC", new Integer(31));
	codonToIndexMap.put("GGG", new Integer(32));
	codonToIndexMap.put("GGT", new Integer(33));
	codonToIndexMap.put("GTA", new Integer(34));
	codonToIndexMap.put("GTC", new Integer(35));
	codonToIndexMap.put("GTG", new Integer(36));
	codonToIndexMap.put("GTT", new Integer(37));
	codonToIndexMap.put("AAA", new Integer(38));
	codonToIndexMap.put("AAG", new Integer(39));
	codonToIndexMap.put("AAC", new Integer(40));
	codonToIndexMap.put("AAT", new Integer(41));
	codonToIndexMap.put("CAA", new Integer(42));
	codonToIndexMap.put("CAG", new Integer(43));
	codonToIndexMap.put("CAC", new Integer(44));
	codonToIndexMap.put("CAT", new Integer(45));
	codonToIndexMap.put("GAA", new Integer(46));
	codonToIndexMap.put("GAG", new Integer(47));
	codonToIndexMap.put("GAC", new Integer(48));
	codonToIndexMap.put("GAT", new Integer(49));
	codonToIndexMap.put("TAC", new Integer(50));
	codonToIndexMap.put("TAT", new Integer(51));
	codonToIndexMap.put("TGC", new Integer(52));
	codonToIndexMap.put("TGT", new Integer(53));
	codonToIndexMap.put("TTC", new Integer(54));
	codonToIndexMap.put("TTT", new Integer(55));
	codonToIndexMap.put("ATA", new Integer(56));
	codonToIndexMap.put("ATC", new Integer(57));
	codonToIndexMap.put("ATT", new Integer(58));
	codonToIndexMap.put("ATG", new Integer(59));
	codonToIndexMap.put("TGG", new Integer(60));
	codonToIndexMap.put("TAA", new Integer(61));
	codonToIndexMap.put("TAG", new Integer(62));
	codonToIndexMap.put("TGA", new Integer(63));



	codonToAminoAcidMap.put("CGA", "R");
	codonToAminoAcidMap.put("CGC", "R");
	codonToAminoAcidMap.put("CGG", "R");
	codonToAminoAcidMap.put("CGT", "R");
	codonToAminoAcidMap.put("AGA", "R");
	codonToAminoAcidMap.put("AGG", "R");
	codonToAminoAcidMap.put("CTA", "L");
	codonToAminoAcidMap.put("CTC", "L");
	codonToAminoAcidMap.put("CTG", "L");
	codonToAminoAcidMap.put("CTT", "L");
	codonToAminoAcidMap.put("TTA", "L");
	codonToAminoAcidMap.put("TTG", "L");
	codonToAminoAcidMap.put("TCA", "S");
	codonToAminoAcidMap.put("TCC", "S");
	codonToAminoAcidMap.put("TCG", "S");
	codonToAminoAcidMap.put("TCT", "S");
	codonToAminoAcidMap.put("AGC", "S");
	codonToAminoAcidMap.put("AGT", "S");
	codonToAminoAcidMap.put("ACA", "T");
	codonToAminoAcidMap.put("ACC", "T");
	codonToAminoAcidMap.put("ACG", "T");
	codonToAminoAcidMap.put("ACT", "T");
	codonToAminoAcidMap.put("CCA", "P");
	codonToAminoAcidMap.put("CCC", "P");
	codonToAminoAcidMap.put("CCG", "P");
	codonToAminoAcidMap.put("CCT", "P");
	codonToAminoAcidMap.put("GCA", "A");
	codonToAminoAcidMap.put("GCC", "A");
	codonToAminoAcidMap.put("GCG", "A");
	codonToAminoAcidMap.put("GCT", "A");
	codonToAminoAcidMap.put("GGA", "G");
	codonToAminoAcidMap.put("GGC", "G");
	codonToAminoAcidMap.put("GGG", "G");
	codonToAminoAcidMap.put("GGT", "G");
	codonToAminoAcidMap.put("GTA", "V");
	codonToAminoAcidMap.put("GTC", "V");
	codonToAminoAcidMap.put("GTG", "V");
	codonToAminoAcidMap.put("GTT", "V");
	codonToAminoAcidMap.put("AAA", "K");
	codonToAminoAcidMap.put("AAG", "K");
	codonToAminoAcidMap.put("AAC", "N");
	codonToAminoAcidMap.put("AAT", "N");
	codonToAminoAcidMap.put("CAA", "Q");
	codonToAminoAcidMap.put("CAG", "Q");
	codonToAminoAcidMap.put("CAC", "H");
	codonToAminoAcidMap.put("CAT", "H");
	codonToAminoAcidMap.put("GAA", "E");
	codonToAminoAcidMap.put("GAG", "E");
	codonToAminoAcidMap.put("GAC", "D");
	codonToAminoAcidMap.put("GAT", "D");
	codonToAminoAcidMap.put("TAC", "Y"); 
	codonToAminoAcidMap.put("TAT", "Y"); 
	codonToAminoAcidMap.put("TGC", "C"); 
	codonToAminoAcidMap.put("TGT", "C"); 
	codonToAminoAcidMap.put("TTC", "F");
	codonToAminoAcidMap.put("TTT", "F");
	codonToAminoAcidMap.put("ATA", "I");
	codonToAminoAcidMap.put("ATC", "I");
	codonToAminoAcidMap.put("ATT", "I"); 
	codonToAminoAcidMap.put("ATG", "M"); 
	codonToAminoAcidMap.put("TGG", "W"); 
	codonToAminoAcidMap.put("TAA", "Z"); 
	codonToAminoAcidMap.put("TAG", "Z");
	codonToAminoAcidMap.put("TGA", "Z");

	random = new Random();
	compresser = new Deflater();
    }


    public double getCompressionRatio(char[] sequence)
	{
		// Encode a String into bytes
		String inputString = new String(sequence);
		byte[] input = null;
		try
		{
 		
	 	 input = inputString.getBytes("UTF-8");

 		
		}
		catch (Exception exception)
		{
			System.out.println("SequenceUtilities.getCompressionRatio():" + exception);
			exception.printStackTrace();
		}	
		// Compress the bytes
		byte[] output = new byte[sequence.length];
 	
		compresser.setInput(input);
 		compresser.finish();	
 		int compressedDataLength = compresser.deflate(output);
		compresser.reset();

		return ((double)(compressedDataLength) / (double)(sequence.length));
	}

   public double getCompressionRatio(char[] sequence, int startIndex, int stopIndex)
	{
		String inputString = new String(sequence);
		char[] inputSubstring = inputString.substring(startIndex, (stopIndex+1)).toCharArray();
		return getCompressionRatio(inputSubstring);
	}
    
    public int[] getNucleotideCounts(char[] sequence)
    {
	int length = sequence.length;
	int[] nucleotideCounts = new int[4];
	char inChar;
	
	for (int j = 0; j < 4; j++)
	    {
		nucleotideCounts[j] = 0;
	    }

	for (int i = 0; i < length; i++)
	    {
		inChar = sequence[i];
		if (inChar == 'A') {nucleotideCounts[0] = nucleotideCounts[0] + 1;}
		else if (inChar == 'C') {nucleotideCounts[1] = nucleotideCounts[1] + 1;}
		else if (inChar == 'G') {nucleotideCounts[2] = nucleotideCounts[2] + 1;}
		else if (inChar == 'T') {nucleotideCounts[3] = nucleotideCounts[3] + 1;}
	    }

	return nucleotideCounts;
    }

   public int[][] getPositionalNucleotideCounts(char[] sequence, int start, int stop)
   {
	int[][] positionalCounts = new int[4][3];
	char inChar1, inChar2, inChar3;
	int inChar1Int, inChar2Int, inChar3Int;

	for (int i = 0; i < 4; i++)
	{	
		for (int j = 0; j < 3; j++)
	    	{
			positionalCounts[i][j] = 0;
	    	}	
	}

	stop = stop - 2;
	for (int i = start; i <= stop; i = i + 3)
	{
		inChar1 = sequence[i];
		inChar2 = sequence[i+1];
		inChar3 = sequence[i+2];
		inChar1Int = getIndexForNucleotide(inChar1);
		inChar2Int = getIndexForNucleotide(inChar2);
		inChar3Int = getIndexForNucleotide(inChar3);
		if ((inChar1Int) >= 0) positionalCounts[inChar1Int][0] = positionalCounts[inChar1Int][0] + 1;
		if ((inChar2Int) >= 0) positionalCounts[inChar2Int][1] = positionalCounts[inChar2Int][1] + 1;
		if ((inChar3Int) >= 0) positionalCounts[inChar3Int][2] = positionalCounts[inChar3Int][2] + 1;
	}


	return positionalCounts;
   }


   public int getIndexForNucleotide(char inChar)
	{

		int randomValue;

		if (inChar == 'A') return 0;
		else if (inChar == 'C') return 1;
		else if (inChar == 'G') return 2;
		else if ((inChar == 'T') || (inChar == 'U')) return 3;
		else if (inChar == 'N')
	    	{
			randomValue = random.nextInt(4);
			return randomValue;
	    	}
		else if (inChar == 'Y')
	    	{
			randomValue = random.nextInt(2);
			if (randomValue == 0) return 1;
			else return 3;
	    	}
		else if (inChar == 'R')
		{
			randomValue = random.nextInt(2);
			if (randomValue == 0) return 0;
			else return 2;
		}
		else return -1;
	}

   public double[] getHexamerFrequencies(char[] sequence, int startPosition, int endPosition)
	{
		double[] hexamerFrequencies = new double[4096];
		String hexamer;
		int index;
		int totalHexamers = 0;

		for (int i = startPosition; i <= endPosition -5; i= i+6)
		{
			hexamer = "";
			if ((i+5) <= endPosition)
			{
				hexamer = hexamer + sequence[i] + sequence[i+1] + sequence[i+2] + sequence[i+3] + sequence[i+4] + sequence[i+5];
				index = getHexamerIndex(hexamer);
				if (index > -1)
				{
					totalHexamers++;
					hexamerFrequencies[index] = hexamerFrequencies[index] + 1;
				}
			}
		}

		for (int i =0; i < 4096; i++)
		{
			hexamerFrequencies[i] = hexamerFrequencies[i] / (double)totalHexamers;
		}
		return hexamerFrequencies;
		
	}

  public double[] getHexamerUsage(char[] sequence, int startPosition, int endPosition)
	{
		double[] hexamerFrequencies = new double[4096];
		String hexamer;
		int index;
		int totalHexamers = 0;

		for (int i = startPosition; i <= endPosition -5; i= i+6)
		{
			hexamer = "";
			if ((i+5) <= endPosition)
			{
				hexamer = hexamer + sequence[i] + sequence[i+1] + sequence[i+2] + sequence[i+3] + sequence[i+4] + sequence[i+5];
				index = getHexamerIndex(hexamer);
				if (index > -1)
				{
					totalHexamers++;
					hexamerFrequencies[index] = hexamerFrequencies[index] + 1;
				}
			}
		}
		return hexamerFrequencies;
	}

   public double[] getCodonUsage(char[] sequence, int startPosition, int endPosition)
    {
	double[] codonFrequencies = new double[64];
	String codon;
	int index;
	int totalCodons = 0;
	for (int i = startPosition; i <= endPosition - 2; i = i + 3)
	    {
		codon = "";
		if ((i + 2) <= endPosition) // make sure don't go over end of sequence
		    {
			codon = codon + sequence[i] + sequence[i+1] + sequence[i+2];
			//System.out.println("Looking for codon: " + codon);
			
			if (codonToIndexMap.containsKey(codon))
			{
				totalCodons++;
				index = ((Integer)codonToIndexMap.get(codon)).intValue();
				codonFrequencies[index] = codonFrequencies[index] + 1;
			}
			else
			{
				System.out.println("Uknown character in codon: " + codon + " ... ignoring.");
			}
		    }	
	    }

	return codonFrequencies; 
	
    }

    public double[] getCodonFrequencies(char[] sequence, int startPosition, int endPosition)
    {
	double[] codonFrequencies = new double[64];
	String codon;
	int index;
	int totalCodons = 0;
	for (int i = startPosition; i <= endPosition - 2; i = i + 3)
	    {
		//System.out.println("i: " + i + " end position " + endPosition);
		codon = "";
		if ((i + 2) <= endPosition) // make sure don't go over end of sequence
		    {
			codon = codon + sequence[i] + sequence[i+1] + sequence[i+2];
			//System.out.println("Looking for codon: " + codon);
			if (codonToIndexMap.containsKey(codon))
			{
				totalCodons++;
				index = ((Integer)codonToIndexMap.get(codon)).intValue();
				codonFrequencies[index] = codonFrequencies[index] + 1;
			}
			else
			{
				System.out.println("Uknown character in codon: " + codon + " ... ignoring.");
			}
		    }	
	    }

	for (int i = 0; i < 64; i++)
	    {
		codonFrequencies[i] = codonFrequencies[i] / (double)totalCodons;
	    }
	return codonFrequencies;
	
    }

   public String getAminoAcidForCodon(String codon)
	{
		return (String)codonToAminoAcidMap.get(codon);		
	}
   
   public char[] translateToAminoAcidSequence(char[] sequence, int startPosition, int endPosition)
    {
	Vector aminoAcidVector = new Vector();
	String codon;
	int index;
	int totalAminoAcids = 0;
	for (int i = startPosition; i <= endPosition - 2; i = i + 3)
	    {
		codon = "";
		if ((i + 2) <= endPosition) // make sure don't go over end of sequence
		    {
			codon = codon + sequence[i] + sequence[i+1] + sequence[i+2];
			String aminoAcid = (String)codonToAminoAcidMap.get(codon);
			if (aminoAcid != null)
			{
			if (!(aminoAcid.equals("Z")))
			{
				aminoAcidVector.add(aminoAcid);
				totalAminoAcids = totalAminoAcids + 1;
			}
			}
		}
	    }
	char[] aminoAcidSequence = new char[totalAminoAcids];    
	for (int i = 0; i < totalAminoAcids; i++)
	{
		aminoAcidSequence[i] = ((String)aminoAcidVector.elementAt(i)).charAt(0);
	}
	return aminoAcidSequence;
     }

    public char[] reverseSequence(char[] sequence)
    {
	int length = sequence.length;
	char[] reverse = new char[length];

	for (int i = 0; i < length; i++)
	    {
		reverse[i] = sequence[length - i - 1];
	    }
	return reverse;
    }


    public char[] complementSequence(char[] sequence)
    {
	int length = sequence.length;
	char[] complement = new char[length];

	for (int i = 0; i < length; i++)
	    {
		char inChar = sequence[i];
		char inCharComplement = '0';
		
		if (inChar == 'A') inCharComplement = 'T';
		else if (inChar == 'C') inCharComplement = 'G';
		else if (inChar == 'G') inCharComplement = 'C';
		else if (inChar == 'T') inCharComplement = 'A';
		else if	(inChar == 'N') inCharComplement = 'N';
		else if (inChar == 'Y') inCharComplement = 'R';
		else if (inChar == 'R') inCharComplement = 'Y';

		complement[i] = inCharComplement;
	    }
	return complement;
    }

    public char[] complementSequenceInPlace(char[] sequence)
    {
	int length = sequence.length;

	for (int i = 0; i < length; i++)
	    {
		char inChar = sequence[i];
		char inCharComplement = '0';
		
		if (inChar == 'A') inCharComplement = 'T';
		else if (inChar == 'C') inCharComplement = 'G';
		else if (inChar == 'G') inCharComplement = 'C';
		else if (inChar == 'T') inCharComplement = 'A';
		else if (inChar == 'N') inCharComplement = 'N';
		else if (inChar == 'Y') inCharComplement = 'R';
		else if (inChar == 'R') inCharComplement = 'Y';

		sequence[i] = inCharComplement;
	    }
	return sequence;
    }

    public int getHexamerIndex(String hexamer)
	{

		int index = 0;
		int nucIndex;
		for (int i = 0; i < 6; i++)
		{
			char inChar = hexamer.charAt(i);
			nucIndex = getNucleotideIndex(inChar);
			if (nucIndex == -1) return -1;
			else index = index + (int)((int)Math.pow(4.0, (5.0-i)) * nucIndex);
		}
		return index;
	}

    public int getNucleotideIndex(char nucleotide)
	{
		if (nucleotide == 'A') return 0;
		else if (nucleotide == 'C') return 1;
		else if (nucleotide == 'G') return 2;
		else if (nucleotide == 'T') return 3;
		else return -1;
	}

    public char[] generateRandomSequence(int length, double gcBias)
	{
		Random random = new Random();
		char[] returnedSequence = new char[length];
		int counter;
		returnedSequence[0] = 'A';
		returnedSequence[1] = 'T';
		returnedSequence[2] = 'G';
		length = length - 3;

		for (counter = 0; counter < length; counter++)
		{
			double randomDouble = random.nextDouble();
			double coin = random.nextDouble();
		
			if (randomDouble <= gcBias)
			{
				if (coin <= .5)
				{
					returnedSequence[counter+3] = 'G';
				}
				else
				{
					returnedSequence[counter+3] = 'C';
				}
			}
			else
			{
				if (coin <= .5)
				{
					returnedSequence[counter+3] = 'A';
				}
				else
				{
					returnedSequence[counter+3] = 'T';
				}
			}
		}

		return returnedSequence;
	}
 

    
}
