// optimize.cpp
// Implementation of a SARSA optimizer

#include <math.h>
#include "random.h"
#include "model.h"
#include "optimize.h"


// Builds an optimizer from its learning parameters and action-selection
// strategy, allocates a fresh policy table and resets the cycle counters.
//   nEpsilon     - parameter of the Bernoulli draw made in selectAction()
//   nLambda      - weight of the next state-action value in the update rule
//   nAlpha       - stored as given; not read anywhere else in this function
//   actionSelect - EGreedy selects epsilon-greedy; any other value selects
//                  the softmax branch of selectAction()
COptimizer::COptimizer(float nEpsilon, float nLambda, float nAlpha,
											 eSelectType actionSelect) {
	// The parameters hide the members of the same name, so the members are
	// reached explicitly through this->.
	this->nEpsilon     = nEpsilon;
	this->nLambda      = nLambda;
	this->nAlpha       = nAlpha;
	this->actionSelect = actionSelect;

	// Policy table constructed with 0. as its argument.
	pPolicy = new CPolicy(0.);

	// Highest action index, taken from the model's stock count.
	nMaxAction = SYSTEM_MODEL->getStocksCount();

	// No cycles have been run yet.
	nCycles = 0;
	nSignificantCycles = 0;

	INFO_MSG(("Optimizer::Optimizer is up with Epsilon value of %f and Lambda value of %f\n", nEpsilon, nLambda));
}

// Frees the policy table allocated by the constructor and logs the shutdown.
COptimizer::~COptimizer() {
	delete this->pPolicy;
	INFO_MSG(("Optimizer::Optimizer is down\n"));
}


// Runs one learning step:
//   1. read the current state and select an action for it,
//   2. perform that action on the model and collect the reward,
//   3. read the resulting state and select a follow-up action for it,
//   4. move the stored value of (state, action) towards
//        reward + nLambda * value(next state, follow-up action),
//      scaled by alpha(state, action).
// The follow-up action is used only for the value lookup and the log line; it
// is not performed here. Every call increments nCycles; nSignificantCycles is
// incremented when the update changed the action selectOptimalAction()
// reports for the starting state.
// Returns the action that was performed.
CAction COptimizer::optimizeSingleCycle() {
	INFO_MSG(("Optimizer::A single cycle of optimization is starting\n"));
	INFO_MSG(("Optimizer::==========================================\n"));

	// --- Observe the starting state -------------------------------------
	CState stateNow = SYSTEM_MODEL->getState();

	INFO_MSG(("Optimizer::>Current state (S) is "));
	stateNow.display(CLog::fLog);
	INFO_MSG(("\n"));

	// --- Choose and perform the first action ----------------------------
	CAction actionNow = selectAction(stateNow);
	pPolicy->increaseVisits(stateNow, actionNow);

	INFO_MSG(("Optimizer::>Chose to perform first action (A) <%d,%f>\n", actionNow.nStock, actionNow.nPercentage));

	tReward reward = SYSTEM_MODEL->performAction(actionNow);

	INFO_MSG(("Optimizer::>Reward for the action was %lf\n", reward));

	// --- Observe the resulting state and choose the follow-up action ----
	CState stateNext = SYSTEM_MODEL->getState();

	INFO_MSG(("Optimizer::>New state (SS) is "));
	stateNext.display(CLog::fLog);
	INFO_MSG(("\n"));

	CAction actionNext = selectAction(stateNext);

	INFO_MSG(("Optimizer::>Chose to perform second action (AA) <%d,%f>\n", actionNext.nStock, actionNext.nPercentage));

	// Remember the best action before the update so a change can be detected.
	CAction bestBefore = selectOptimalAction(stateNow);

	// --- Value update ---------------------------------------------------
	double updatedValue =
		pPolicy->get(stateNow, actionNow) + alpha(stateNow, actionNow) *
		(reward + nLambda * pPolicy->get(stateNext, actionNext) - pPolicy->get(stateNow, actionNow));
	pPolicy->set(stateNow, actionNow, updatedValue);

	// --- Bookkeeping ----------------------------------------------------
	CAction bestAfter = selectOptimalAction(stateNow);
	if (bestBefore != bestAfter) {
		nSignificantCycles++;
	}
	nCycles++;

	INFO_MSG(("Optimizer::>Updated (S,A) with %f\n", updatedValue));

	return actionNow;
}

// Runs optimizeSingleCycle() nCycles times in a row and discards the actions
// it returns. A zero or negative count runs no cycles.
void COptimizer::optimizeMultiCycle(int nCycles) {
	// The parameter hides the nCycles member inside this function only; the
	// member is still incremented by each optimizeSingleCycle() call.
	int nRemaining = nCycles;
	while (nRemaining > 0) {
		optimizeSingleCycle();
		--nRemaining;
	}
}

// Returns the action with the highest stored policy value in state S.
//
// The baseline is a default-constructed action. Stocks 1..nMaxAction are then
// compared against it, each evaluated as CAction(stock, 1.). The comparison
// is >=, so on equal values the later (higher-numbered) stock wins.
// Only nStock of the returned action is overwritten; its other fields keep
// their default-constructed values.
CAction COptimizer::selectOptimalAction(CState &S) {
	CAction best;
	double nTopValue = pPolicy->get(S, best);

	for (int nStock = 1; nStock <= nMaxAction; nStock++) {
		const double nCandidate = pPolicy->get(S, CAction(nStock, 1.));
		if (nCandidate >= nTopValue) {
			nTopValue = nCandidate;
			best.nStock = nStock;
		}
	}

	return best;
}

// Chooses the action to take in state S according to the configured strategy.
//
// EGreedy:
//   A Bernoulli draw with parameter nEpsilon decides between a random stock
//   index in [0, nMaxAction] and the action from selectOptimalAction(S).
// Any other strategy (softmax):
//   Stock index a in [0, nMaxAction] is drawn with probability
//     exp(Q(S,a)) / sum_b exp(Q(S,b)),
//   where Q(S,a) is the policy value of CAction(a, 1.).
//
// On the random and softmax paths only nStock of the returned action is set;
// its other fields keep their default-constructed values.
CAction COptimizer::selectAction(CState &S) {
	CAction A;

	// Epsilon-Greedy
	if (actionSelect==EGreedy) {
		if (SYSTEM_GENERATOR->bernoulli(nEpsilon)) {
			// Random Action
			// NOTE(review): this uses rand() rather than SYSTEM_GENERATOR, so it
			// is seeded independently of the rest of the optimizer - confirm
			// this is intended.
			A.nStock=rand() %(nMaxAction +1);
			INFO_MSG(("Optimizer::>Chose to perform random action\n"));
		}
		else {
			// Optimal Action
			A=selectOptimalAction(S);
			INFO_MSG(("Optimizer::>Chose to perform optimal action\n"));
		}
	}
	// Softmax
	else {
		const int nActions = nMaxAction + 1;
		double *pi = new double [nActions];

		// Declared once for all loops below: reusing a for-init variable after
		// its loop is ill-formed in ISO C++, while redeclaring it per loop is
		// rejected by pre-standard compilers.
		int a;

		// Fetch the raw values and remember the largest one.
		double nMaxValue = 0.0;
		for (a=0; a<nActions; a++) {
			pi[a]=pPolicy->get(S,CAction(a,1.));
			if (a==0 || pi[a]>nMaxValue)
				nMaxValue=pi[a];
		}

		// Exponentiate relative to the maximum. The probabilities are
		// mathematically unchanged, but exp() can no longer overflow to inf
		// or underflow every weight to 0 (which made the ratios NaN).
		double nSumExp = 0.0;
		for (a=0; a<nActions; a++) {
			pi[a]=exp(pi[a] - nMaxValue);
			nSumExp+=pi[a];
		}

		// Normalise so pi[a] is the probability of taking action a in state S.
		for (a=0; a<nActions; a++)
			pi[a]=pi[a] /nSumExp;

		// Pick the first action whose cumulative probability exceeds the draw.
		// Buckets are half-open [lo, hi), so a draw that lands exactly on a
		// boundary belongs to the following bucket instead of matching none.
		// Assumes uniform() returns a value in [0,1) - confirm against random.h.
		double rnd = SYSTEM_GENERATOR->uniform();
		double nSumPi = 0.0;
		A.nStock=0;		// fallback when no bucket matches (rounding shortfall, NaN)
		for (a=0; a<nActions; a++) {
			nSumPi+=pi[a];
			if (rnd <nSumPi) {
				A.nStock=a; break;
			}
		}

		// Array form: the buffer was allocated with new[].
		delete [] pi;
	}

	return A;
}