/*****************************************************
 Amino Acid Preference Toolkit in Java
 Pathogen Project
 Department of Computer Science and Engineering
 University of South Carolina
 Columbia, SC 29208
 Contact Email: rose@cse.sc.edu
*****************************************************/

import java.io.*;
import java.util.*;

/**
 * This class is meant to extract the important parts from 
 * a Genbank Coding Region webpage.  The important parts
 * are the start/stop indexes for coding regions.
 * This class should be called as follows:
 *
 * ExtractGenes geneExtracter = new ExtractGenes(genbankFileName);
 *
 * After the class is constructed,
 * the array of start/stop indexes can be obtained by calling
 * 
 * int[][] geneIndex = geneExtracter.getGeneArray();
 * 
 * The first index indexes into genes, the second index selects a field:
 * 0 holds the start index, 1 holds the stop index, 2 holds the strand
 * (0 for "+", 1 otherwise), and 3 holds the join flag (0 if the join
 * column is "y", 1 otherwise).
 *
 * The number of genes can be returned by calling
 *
 * int numberOfGenes = geneExtracter.getNumberOfGenes();
 *
 * Modified 07/04/2003
 * - Added the ability to get which strand the genes are on
 * Modified 08/04/2003
 * - Added the ability to get whether this was part of a join or not
 */

public class ExtractGenes
{

    /** Number of values stored per gene: start, stop, strand flag, join flag. */
    private static final int COLUMNS_PER_GENE = 4;

    /**
     * One row per gene: [0] start index, [1] stop index,
     * [2] strand (0 for "+", 1 for anything else),
     * [3] join flag (0 when the join column is "y", 1 otherwise).
     * Never null; empty when the input could not be read or parsed.
     */
    int[][] geneArray;

    /** Number of rows in {@link #geneArray}. */
    int numberOfGenes;

    /**
     * Reads the coding-region file and extracts one entry per data line.
     *
     * Each data line is expected to hold whitespace-separated columns:
     * a location of the form start..stop (either bound may carry a single
     * leading '<' or '>'), a strand column, and an optional join column.
     * Blank lines and lines whose first column is "Location" are skipped.
     *
     * If the file cannot be read or a line is malformed, the problem is
     * reported on standard output and the object is left holding zero genes
     * (an empty, non-null array), so the accessors stay consistent with
     * each other.
     *
     * @param filename path of the coding-region file to read
     */
    public ExtractGenes(String filename)
    {
        // Start from a consistent "no genes" state; it is only replaced
        // once the whole file has been parsed successfully.
        geneArray = new int[0][COLUMNS_PER_GENE];
        numberOfGenes = 0;

        try
        {
            List rows = readGenes(filename);
            int[][] parsed = new int[rows.size()][];
            for (int i = 0; i < parsed.length; i++)
            {
                parsed[i] = (int[]) rows.get(i);
            }
            geneArray = parsed;
            numberOfGenes = parsed.length;
        }
        catch (Exception exception)
        {
            System.out.println("Exception: " + exception);
            exception.printStackTrace();
        }
    }

    /**
     * Reads every line of the file and collects the parsed gene rows.
     * The reader is closed even when parsing fails part-way through.
     *
     * @param filename path of the file to read
     * @return list of int[] rows, one per gene, in file order
     * @throws IOException if the file cannot be read or a line is malformed
     */
    private static List readGenes(String filename) throws IOException
    {
        // Raw List on purpose: this file is written in pre-generics Java.
        List rows = new ArrayList();
        BufferedReader reader = new BufferedReader(new FileReader(filename));
        try
        {
            String line;
            int lineNumber = 0;
            while ((line = reader.readLine()) != null)
            {
                lineNumber++;
                int[] row = parseLine(line, lineNumber);
                if (row != null)
                {
                    rows.add(row);
                }
            }
        }
        finally
        {
            reader.close();
        }
        return rows;
    }

    /**
     * Parses one line of the input file.
     *
     * @param line       the raw line
     * @param lineNumber 1-based line number, used only in error messages
     * @return the four-element gene row, or null when the line is blank
     *         or is the "Location" header line
     * @throws IOException if the line is not a well-formed gene record
     */
    private static int[] parseLine(String line, int lineNumber) throws IOException
    {
        StringTokenizer fields = new StringTokenizer(line);
        if (!fields.hasMoreTokens())
        {
            return null; // blank line: nothing to extract
        }

        String location = fields.nextToken();
        if (location.equals("Location"))
        {
            return null; // column header line
        }

        if (!fields.hasMoreTokens())
        {
            throw malformed(lineNumber, line, "missing strand column");
        }
        String strand = fields.nextToken();

        // The join column is optional; an absent column means "not joined".
        String join = fields.hasMoreTokens() ? fields.nextToken() : "n";

        // The location is "start..stop"; splitting on '.' yields the two bounds.
        StringTokenizer bounds = new StringTokenizer(location, ".");
        if (bounds.countTokens() != 2)
        {
            throw malformed(lineNumber, line, "location is not of the form start..stop");
        }

        int[] row = new int[COLUMNS_PER_GENE];
        try
        {
            row[0] = parseCoordinate(bounds.nextToken());
            row[1] = parseCoordinate(bounds.nextToken());
        }
        catch (NumberFormatException numberFormatException)
        {
            IOException wrapped = malformed(lineNumber, line, "non-numeric coordinate");
            wrapped.initCause(numberFormatException);
            throw wrapped;
        }
        row[2] = strand.equals("+") ? 0 : 1;
        row[3] = join.equals("y") ? 0 : 1;
        return row;
    }

    /**
     * Converts one location bound to an int, dropping a single leading
     * '<' or '>' marker if present.
     *
     * @param text the bound as it appears in the file
     * @return the numeric coordinate
     * @throws NumberFormatException if the remaining text is not an integer
     */
    private static int parseCoordinate(String text)
    {
        if (text.startsWith(">") || text.startsWith("<"))
        {
            text = text.substring(1);
        }
        return Integer.parseInt(text);
    }

    /** Builds the exception used to report a malformed input line. */
    private static IOException malformed(int lineNumber, String line, String reason)
    {
        return new IOException("Line " + lineNumber + ": " + reason + ": " + line);
    }

    /**
     * Returns the extracted genes, one row per gene with the columns
     * start, stop, strand flag and join flag. The returned array is the
     * internal one, not a copy.
     *
     * @return the gene array; empty (never null) when nothing was extracted
     */
    public int[][] getGeneArray()
    {
        return geneArray;
    }

    /**
     * @return the number of genes extracted from the input file
     */
    public int getNumberOfGenes()
    {
        return numberOfGenes;
    }

    /**
     * Writes one tab-separated line per gene: a leading tab, the gene's
     * position in the array, then start, stop, strand flag and join flag.
     * Failures are reported on standard output rather than thrown.
     *
     * @param filename path of the file to create or overwrite
     */
    public void writeToFile(String filename)
    {
        try
        {
            PrintWriter printWriter = new PrintWriter(new FileWriter(filename));
            try
            {
                for (int i = 0; i < numberOfGenes; i++)
                {
                    printWriter.println("\t" + i + "\t" + geneArray[i][0] + "\t" + geneArray[i][1] + "\t" + geneArray[i][2] + "\t" + geneArray[i][3]);
                }

                // PrintWriter hides I/O failures; checkError() flushes and reports them.
                if (printWriter.checkError())
                {
                    throw new IOException("Error while writing " + filename);
                }
            }
            finally
            {
                printWriter.close();
            }
        }
        catch (Exception exception)
        {
            System.out.println("Exception: " + exception);
            exception.printStackTrace();
        }
    }

    /**
     * Command-line entry point: prints every extracted gene and, when a
     * second argument is given, also writes the genes to that file.
     *
     * @param args the input file name, optionally followed by an output file name
     */
    public static void main(String[] args)
    {
        if ((args.length < 1) || (args.length > 2))
        {
            System.out.println("Usage: java ExtractGenes genbankCodingRegionFile [outputFileName]");
            return;
        }

        ExtractGenes geneExtracter = new ExtractGenes(args[0]);
        int[][] genesFromFile = geneExtracter.getGeneArray();
        int numberOfGenesFromFile = geneExtracter.getNumberOfGenes();

        for (int i = 0; i < numberOfGenesFromFile; i++)
        {
            System.out.println("Gene " + i + ": Start => " + genesFromFile[i][0] + " Stop => " + genesFromFile[i][1] + ": Strand => " + genesFromFile[i][2] + ": Join => " + genesFromFile[i][3]);
        }

        if (args.length == 2)
        {
            geneExtracter.writeToFile(args[1]);
        }
    }

}
