#include "McRateUtils.h"
using namespace McRateUtils;

#include "someUtil.h"
#include "errorMsg.h"

#include <cmath>


using namespace std;





// Scale vecToScale in place so that its new average is avgIn.
// Returns the scaling factor, i.e. avgIn divided by the original average.
//
// A vector that is empty, or whose average is exactly zero, cannot be rescaled
// to a target average: the factor would be 0/0 or x/0 and the vector would be
// filled with NaN/inf values. Both cases are reported through
// errorMsg::reportError. Should reportError return, the computation proceeds
// exactly as it did before these checks were added.
//
// After scaling, the average is recomputed with computeAverage and an error is
// reported if it differs from avgIn by more than 0.001.
MDOUBLE McRateUtils::scaleVec(Vdouble& vecToScale, const MDOUBLE avgIn)
{
	const Vdouble::size_type vecSize = vecToScale.size();
	if (vecSize == 0)
		errorMsg::reportError("cannot scale an empty vector in McRateUtils::scaleVec");

	MDOUBLE sum = 0;
	for (Vdouble::size_type x = 0; x < vecSize; ++x)
	{
		sum += vecToScale[x];
	}

	const MDOUBLE avg = sum / vecSize;
	// exact comparison is intended: only a true zero makes the division below undefined
	if (avg == 0)
		errorMsg::reportError("cannot scale a vector whose average is zero in McRateUtils::scaleVec");
	const MDOUBLE scaleFactor = avgIn / avg;

	for (Vdouble::size_type i = 0; i < vecSize; ++i)
	{
		vecToScale[i] *= scaleFactor;
	}

	// sanity check: the rescaled vector must average to the requested value
	const MDOUBLE newAvg = computeAverage(vecToScale);
	if (fabs(newAvg - avgIn) > 0.001)
		errorMsg::reportError(" problem - scalled average is not avgIn after scalling!!!");
	return scaleFactor;
}


// Pearson product-moment correlation coefficient between xVec and yVec
// (page 565-566, Sokal and Rohlf):
//
//   r = sum(dx * dy) / sqrt( sum(dx^2) * sum(dy^2) )
//
// where dx and dy are the deviations of each element from the average of its
// vector, the averages being obtained from computeAverage.
//
// An error is reported through errorMsg::reportError when the two vectors do
// not have the same size.
// NOTE(review): the loop below indexes yVec up to xVec.size(); this is only
// safe on mismatched sizes if reportError does not return - confirm.
//
// When the sum of squared deviations of either vector is zero (for example a
// constant vector) the expression is 0/0 and the returned value is NaN;
// callers that may pass such input have to check for it.
MDOUBLE McRateUtils::calcCorrelationCoefficient(const Vdouble& xVec, const Vdouble& yVec)
{
	if (xVec.size() != yVec.size())
	{
		errorMsg::reportError("the two vectors to correlate are not the same size in SimulateRates::calcCorrelationCoefficient");
	}

	const Vdouble::size_type seqLength = xVec.size();
	const MDOUBLE x_avg = computeAverage(xVec);
	const MDOUBLE y_avg = computeAverage(yVec);
	MDOUBLE sum_x_square = 0;
	MDOUBLE sum_y_square = 0;
	MDOUBLE sum_xy = 0;
	for (Vdouble::size_type i = 0; i < seqLength; ++i)
	{
		const MDOUBLE x_diff = xVec[i] - x_avg;
		const MDOUBLE y_diff = yVec[i] - y_avg;
		// plain multiplication instead of pow(): squaring needs no general power routine
		sum_x_square += x_diff * x_diff;
		sum_y_square += y_diff * y_diff;
		sum_xy += x_diff * y_diff;
	}

	return sum_xy / sqrt(sum_x_square * sum_y_square);
}








//loadRatesDistributionFromFile: read rates from a rate4site output file and return in ratesVec
//colNum is the rates column to be read
void McRateUtils::loadRatesDistributionFromFile(string ratesDistFile, Vdouble& ratesVec, int colNum)
{
	ifstream ratesFile(ratesDistFile.c_str());
	vector<string> distFileData;
	putFileIntoVectorStringArray(ratesFile,distFileData);
	if (distFileData.empty()){
		errorMsg::reportError("unable to open file, or file is empty in SimulateRates::loadRatesDistFromFile");
	}


	vector<string>::const_iterator it= distFileData.begin();
	for (; it!= distFileData.end(); ++it) 
	{
		if (it->empty())
		{// empty line continue
			continue; 
		}
		if ((*it)[0]=='#')
		{// remark line 
			continue;  
		}

		if (colNum == 2)
		{

			//in ratesDistFile: only pos# and rate 
			int startRate = 1+ it->find("\t", 0);
			int endRate = 1+ it->find("\t", startRate);
			if (startRate>0)
			{
				string rateStr = it->substr(startRate, endRate-startRate -1);
				MDOUBLE rate = string2double(rateStr);
				ratesVec.push_back(rate);
			}
		}
		else if (colNum == 3)
		{
			//in ratesDistFile: column1 is pos#, column2 = aa, column3 = rate 
			int startAA = 1+ it->find("\t", 0);
			int startRate = 1+ it->find("\t", startAA);
			int endRate = 1+ it->find("\t", startRate);
			if (startAA>0)
			{
				string AA = it->substr(startAA, startRate-startAA -1);
				string rateStr = it->substr(startRate, endRate-startRate -1);
				MDOUBLE rate = string2double(rateStr);
				ratesVec.push_back(rate);
			}
		}
		else
			errorMsg::reportError("unknown number of columns in SimulateRates::loadRatesDistributionFromFile");

	}
}

