/* This is an example HY-PHY Batch File.

   It reads in two  nucleotide datasets (assumed to share the same tree)
   and performs the relative ratio test on a tree, using F81 model on the
   first partition and K81uf on the second. The likelihood ratio statistic 
   is evaluated and the P-value for the test is reported.
  
   Sergei L. Kosakovsky Pond
   September 2005 

*/



/* 1. Read in the data and store the result in  DataSet variables. Also prompt the user 
for a tree (assumed to be in the first data file), by including a stanard queryTree.bf module*/


SetDialogPrompt ("Locate the first nucleotide sequence file:");
DataSet 		nucleotideSequence1 = ReadDataFile (PROMPT_FOR_FILE);

incFileName = HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"queryTree.bf";
ExecuteCommands  ("#include \""+incFileName+"\";");

SetDialogPrompt ("Locate the second nucleotide sequence file:");
DataSet 		nucleotideSequence2 = ReadDataFile (PROMPT_FOR_FILE);

if (nucleotideSequence1.species != nucleotideSequence2.species)
{
	fprintf (stdout, "\nERROR: Both data sets must have the same number of sequences\n");
	return 0;
}

/* 2. Filter the data, specifying that all of the data is to be used
	  and that it is to be treated as nucleotides. */

	  

DataSetFilter	filteredData1 = CreateFilter (nucleotideSequence1,1);
DataSetFilter	filteredData2 = CreateFilter (nucleotideSequence2,1);



/* 3. Collect observed nucleotide frequencies from the filtered data. observedFreqs will
	  store the vector of frequencies. */



HarvestFrequencies (observedFreqs1, filteredData1, 1, 1, 1);
HarvestFrequencies (observedFreqs2, filteredData2, 1, 1, 1);

/* 4. Define the F81 and K81uf substitution matrix. '*' is defined to be -(sum of off-diag row elements) */

F81RateMatrix    =  {{*,t,t,t}{t,*,t,t}{t,t,*,t}{t,t,t,*}};

global			 R_AC = 1; /* Relative to R_AG = 1*/
global			 R_AT = 1;

K81ufRateMatrix  =  {{*,R_AC*t,t,R_AT*t}{R_AC*t,*,R_AT*t,t}{t,R_AT*t,*,R_AC*t}{R_AT*t,t,R_AC*t,*}};

		 
/*5.  Define the models models, by combining the substitution matrix with the vector of observed 
	  (equilibrium) frequencies. */


Model 	F81_Model   = (F81RateMatrix , observedFreqs1);
Model   K81uf_Model = (K81ufRateMatrix , observedFreqs2);


/*6.  Now we can define 2 trees - one for each data block. We use appropriate models for each one.
	  treeString is defined by 'queryTree.bf' */

TRY_NUMERIC_SEQUENCE_MATCH = 1;

UseModel (F81_Model);
Tree	tree_1 = treeString;

UseModel (K81uf_Model);
Tree	tree_2 = treeString;



/*7.  Since all the likelihood function ingredients (data, tree, equilibrium frequencies)
	  have been defined we are ready to construct the likelihood function. We
	  combine both datasets into one likelihood function. */

LikelihoodFunction  theLnLik = (filteredData1, tree_1,filteredData2, tree_2);



/*8.  Maximize the likelihood function, storing parameter values in the matrix paramValues. 
	  We also store the resulting ln-lik and the number of model parameters. */


Optimize (paramValues, theLnLik);
unconstrainedLnLik = paramValues[1][0];
paramCount = paramValues[1][1] + 6	;


/*9.  Print the tree with optimal branch lengths to the console. */
/*    also print the ratio in branch lengths between matching branches */

fprintf  (stdout, "\n1). UNCONSTRAINED MODEL:", theLnLik);

bl1 = BranchLength (tree_1, -1);
bl2 = BranchLength (tree_2, -1);
bn  = BranchName   (tree_1, -1);

fprintf (stdout, "\n\nBranch lengths report\n\nLength in Tree 1, Length in Tree 2, Ratio, Branch Name\n");

for (k=0; k<Columns (bn)-1; k=k+1)
{
	fprintf (stdout, Format (bl1[k], 16, 6), "," , Format (bl2[k], 17, 6), ", ", Format(bl1[k]/bl2[k],6,3), ", ", bn[k], "\n");
}

/*10. We now constrain the rate of evolution along each branch to be proportional in both trees. 
	  R will represent the ratio. We use ReplicateConstraint to automatically
	  attach the same constraint to all branches of the tree. 
	  F*/

global REL_R = 1;
ReplicateConstraint ("this1.?.t:=REL_R*this2.?.t", tree_2, tree_1);
Optimize (paramValues, theLnLik);


/*11. Now we compute the ln-lik ratio statistic and the P-Value, using the Chi^2 dist'n 
	  with an appropriate degree of freedom. */

lnlikDelta = 2 (unconstrainedLnLik-paramValues[1][0]);
pValue = 1-CChi2 (lnlikDelta, paramCount - paramValues[1][1]);

fprintf (stdout, "\n\n1). Relative ratio constraint\n\nLikelihood Ratio Statistic = ", lnLikDelte, "\nP-value = ", pValue, "\n", theLnLik);

bl1 = BranchLength (tree_1, -1);
bl2 = BranchLength (tree_2, -1);
bn  = BranchName   (tree_1, -1);

fprintf (stdout, "\n\nBranch lengths report\n\nLength in Tree 1, Length in Tree 2, Ratio, Branch Name\n");

for (k=0; k<Columns (bn)-1; k=k+1)
{
	fprintf (stdout, Format (bl1[k], 16, 6), "," , Format (bl2[k], 17, 6), ", ", Format(bl1[k]/bl2[k],6,3), ", ", bn[k], "\n");
}