/******************************************************************************
*
* gambler.c
*
* Policy Iteration for the Gambler problem (example 4.3)
*
* Huy N Pham <huy_n_pham@yahoo.com>
*
******************************************************************************/
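
/*
 * Build and run (assumed invocation, not part of the original source;
 * -lm is typically needed for fabs()):
 *
 *   cc -o gambler gambler.c -lm && ./gambler
 */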
#include <stdio.h>
#include <math.h>

/*
* Prototypes
*/
void show_V(void);
void show_POL(void);
void initialize(void);
void evaluate_policy(void);
void improve_policy(void);
double update_state_val(int s);
int argmax(int s);
int min(int a, int b);

/* How close the approximation should be */
#define THETA 0.000001

/* How far-sighted should we be */
#define GAMMA 1                 // <-- Very (no discounting)

/* Probability for the coin to come up heads (i.e., a win) */
#define ODDVAL 0.4

/* V[s] holds the values of the 99 normal states and the 2 terminal states */
double V[101];                  // <-- s = {0..100}, where V[0] and V[100] are ALWAYS 0

/* P[s][a][s'] is the probability of ending up in s' from state s by doing a */
double P[100][51][101];         // <-- s = {1..99}, a = {1..50}, s' = {0..100}

/* POL[s] is the amount to bet in state s */
int POL[100];                   // <-- s = {1..99}

/* R[s][a][s'] is the reward received for reaching s' from s by doing a */
double R[100][51][101];         // <-- s = {1..99}, a = {1..50}, s' = {0..100}

/* Is the current policy stable? */
int policy_stable;              // <-- No if 0, Yes otherwise.

/*
* main()
*
* Policy Iteration
*/
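
/*
 * Sketch of the loop in main():
 *
 *   repeat
 *     1. Policy Evaluation:  make V consistent with the current POL
 *                            (iterated to within THETA)
 *     2. Policy Improvement: make POL greedy with respect to V
 *   until POL no longer changes (policy_stable != 0)
 */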

int main(int argc, char *argv[]){
    initialize();

    do{
        evaluate_policy();      // <== Policy Evaluation (for the current policy)
        improve_policy();       // <== Policy Improvement
    }while (!policy_stable);

    show_V();
    show_POL();

    return 0;
}

/*
* initialize()
*
* Data structures initialization
*/
void initialize(){
    int i, j, k;

/*
* Info and User inputs
*/
printf("\n\n"); printf("Policy Iteration for the Gambler problem in example 4.3\n"); printf("by Huy N Pham <huy_n_pham@yahoo.com>\n"); printf("\n"); /*
 * Start with a zero state-value function and
 * an arbitrary initial policy (always bet 1 dollar)
*
 * Note that terminal-state values are also initialized to 0
*/
    for(i = 0; i < 101; i++)            // <-- Terminal states are included
        V[i] = 0;
    for(i = 1; i < 100; i++)            // <-- but POL[] only covers s = {1..99};
        POL[i] = 1;                     //     writing POL[100] would overflow it

/*
* Initialize P[s][a][s']
*/
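
/*
 * For example, from s = 50 with a bet of a = 25:
 *
 *   P[50][25][75] = ODDVAL       (heads: the stake is won)
 *   P[50][25][25] = 1 - ODDVAL   (tails: the stake is lost)
 *
 * and every other P[50][25][k] stays 0.
 */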
    for(i = 1; i < 100; i++){                   // <== 99 possible s
        for(j = 1; j <= min(i, 100 - i); j++){  // <== up to 50 possible a
            for(k = 0; k < 101; k++){           // <== 101 possible s'
                if (k == (i + j))               // <== the coin came up Heads
                    P[i][j][k] = ODDVAL;
                else if (k == (i - j))          // <== the coin came up Tails
                    P[i][j][k] = 1 - ODDVAL;
                else
                    P[i][j][k] = 0;             // <== no other case is possible
            }
        }
    }

/*
* Initialize R[s][a][s']
*/
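
/*
 * Only transitions into the winning terminal state s' = 100 pay a reward
 * of 1; everything else pays 0. Combined with GAMMA = 1, this makes V(s)
 * the probability of eventually reaching $100 from state s.
 */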
    for(i = 1; i < 100; i++){                   // <== 99 possible s
        for(j = 1; j <= min(i, 100 - i); j++){  // <== up to 50 possible a
            for(k = 0; k < 101; k++){           // <== 101 possible s'
                if (k == 100)                   // <== We won the game
                    R[i][j][k] = 1;
                // else if (k == 0)             // <== damn! we lost it all
                //     R[i][j][k] = -1;
                else
                    R[i][j][k] = 0;
            }
        }
    }

    return;
}

/*
* show_V()
*
 * Print out the state values for all states
*/
void show_V(){
    int i;

    printf("       STATE VALUES\n");
    printf("      |     1     2     3     4     5     6     7     8     9    10\n");
    printf(" -----+------------------------------------------------------------\n");
    printf("    0 |");
    for(i = 1; i < 100; i++){
        printf(" %5.2f", V[i]);
        if ((i % 10) == 0){
            printf("\n  %3d |", i);
        }
    }
    printf("\n");
    printf("\n");
}

/*
* show_POL()
*
 * Print out the policy
*/
void show_POL(){
    int i;

    printf("       POLICY\n");
    printf("      |   1   2   3   4   5   6   7   8   9  10\n");
    printf(" -----+----------------------------------------\n");
    printf("    0 |");
    for(i = 1; i < 100; i++){
        printf(" %3d", POL[i]);
        if ((i % 10) == 0){
            printf("\n  %3d |", i);
        }
    }
    printf("\n");
    printf("\n");
}

/*
* evaluate_policy()
*
* Policy Evaluation: Iteratively update V(s) for all states s.
*/

void evaluate_policy(){
    int i;
    double delta = 0;
    double v;

/*
 * Iterate until V(s) converges
*/
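
/*
 * delta tracks the largest single-state change in one sweep; iteration
 * stops once a full sweep moves no state value by THETA or more.
 */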
    do{
        delta = 0;

/*
* Iteratively update each state in the state space
*
* Note: Since terminal states should always have values
 * of zero (once we are in these states, no further
 * reward is possible), they are excluded here and won't
* get updated.
*/
        for(i = 1; i < 100; i++){       // <-- terminal states excluded
            v = V[i];
            V[i] = update_state_val(i);
            delta = (delta > fabs(v - V[i]) ? delta : fabs(v - V[i]));
        }
    }while (delta >= THETA);

    return;
}

/*
* update_state_val()
*
* Update a state value using Bellman equation
*/
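
/*
 * General form of the backup computed below (Bellman equation for V under
 * the current policy), with a = POL[s]:
 *
 *   V(s) <- sum over s' of  P[s][a][s'] * ( R[s][a][s'] + GAMMA * V(s') )
 *
 * Only two s' have nonzero probability here, so the sum has two terms.
 */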
double update_state_val(int s){
    double v;
    int a;

/*
* In this particular problem,
* from any given state s, we have only two possible s':
*
* s' = s + POL[s], with probability ODDVAL
* OR: s' = s - POL[s], with probability (1 - ODDVAL)
*
* Therefore, our equation has 2 terms
*/
    a = POL[s];

    v = P[s][a][s+a] * (R[s][a][s+a] + (GAMMA * V[s+a])) +
        P[s][a][s-a] * (R[s][a][s-a] + (GAMMA * V[s-a]));

    return v;
}

/*
* improve_policy()
*
 * Use the current state-value function to select the best action for every state
*/
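
/*
 * policy_stable starts at 1 and is cleared as soon as any state's greedy
 * action differs from the current policy; main() then runs another
 * evaluation/improvement round.
 */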
void improve_policy(){
    int i;
    int old_action;

    policy_stable = 1;

    for(i = 1; i < 100; i++){           // <== 99 possible states
        old_action = POL[i];
        POL[i] = argmax(i);
        if (POL[i] != old_action)
            policy_stable = 0;
    }

    return;
}

/*
* argmax()
*
* Return the best (greedy) action for s
*/
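
/*
 * The greedy choice evaluated below:
 *
 *   argmax over a in {1..min(s, 100-s)} of
 *       ODDVAL   * ( R[s][a][s+a] + GAMMA * V[s+a] )
 *   + (1-ODDVAL) * ( R[s][a][s-a] + GAMMA * V[s-a] )
 */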
int argmax(int s){
    int i;
    int best_action = 1;
    double best_reward = 0;             // <-- double (was float), to match V[]
    double scoreboard[51];              //     and the precision implied by THETA

/*
* Try every possible action and record their corresponding rewards
*/
    for(i = 1; i <= min(s, 100 - s); i++){
        scoreboard[i] = ODDVAL * (R[s][i][s+i] + (GAMMA * V[s+i])) +
                        (1 - ODDVAL) * (R[s][i][s-i] + (GAMMA * V[s-i]));
    }

/*
* Which action was the best?
*/
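
/*
 * Note: the strict '>' below breaks ties in favor of the smallest stake,
 * so among equally good bets the most conservative one is kept.
 */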
    for(i = 1; i <= min(s, 100 - s); i++){
        if (scoreboard[i] > best_reward){
            best_reward = scoreboard[i];
            best_action = i;
        }
    }

    return best_action;
}

/*
* min()
*
 * Return the smaller of two integers
*/
int min(int a, int b){
    return ((a < b) ? a : b);
}