/***
*
* agent-cmac-dbi.cc
* 
* $Revision: 1.3 $
* 
* Description:
*   This module implements the agent's functions for controlling a
*   double integrator. The function approximation is implemented with
*   cmacs.
*
* Author:
*   Juan Carlos Santamaria
*     E-mail: carlos@cc.gatech.edu
*     URL:    http://www.cc.gatech.edu/ai/students/jcs
* 
* File name:
*   $Id: agent-cmac-dbi.cc,v 1.3 1996/09/19 22:10:22 carlos Exp $
*
* Revision History:
*   $Log: agent-cmac-dbi.cc,v $
*   Revision 1.3  1996/09/19  22:10:22  carlos
*   - The discount factor, GAMMA, is now an agent field instead of a constant.
*     The value can be set at the moment of creation through the constructor.
*     The default value is 0.99.
*   - Eliminate the arguments ps and pa from Agent::step.
*
*   Revision 1.2  1996/08/29  15:53:19  carlos
*   - Now using the constant TERMINAL_STATE instead of 0.
*   - Old local variables are now member variables.
*   - New private function A_CMAC_DBI::policy was added and subsequent
*     modifications.
*   - Change in the order of arguments in Agent::step:
*       old: S,A,r,S'    new: S,A,S',r
*
*   Revision 1.1  1996/08/14  20:54:41  carlos
*   Initial revision
*
****/

#pragma implementation


// -- Include files

#include <iostream.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#include "agent-cmac-dbi.h"

#include "usrdef.h"
#include "rand.h"
#include "cmac.h"


// -- Local constants

// Discretization.
//   NINTERVALS - interval count given to every cmac variable (pos, vel, for)
//                and resolution of the grid written by
//                output_value_function_grid.
//   NACTIONS   - number of evenly spaced force values between FOR_MIN and
//                FOR_MAX that are evaluated when choosing an action.
static const int NINTERVALS = 12;
static const int NACTIONS   = 24+1;

// Learning parameters.
//   ALPHA   - factor applied to the error before update_value_function.
//   LAMBDA  - passed together with GAMMA to update_eligibilities.
//   EPSILON - select_best receives 1.0-EPSILON (presumably the probability
//             of taking the best action -- confirm against select_best).
static const double ALPHA   = 0.1;
static const double LAMBDA  = 0.7;
static const double EPSILON = 0.0;

// Value given to initialize_value_function for a newly created cmac.
static const double V_0 = 0.0;


// -- Local function declarations

// Stores the position and velocity of *ps in code[POS] and code[VEL].
// No other entry of code is written.
void encode_state( double code[3], const State* ps );

// Stores the acceleration of *pa in code[FOR].
// No other entry of code is written.
void encode_action( double code[3], const Force *pa );


// -- Member function definitions

//============================================================================
// A_CMAC_DBI::init

// Configures the agent from a list of arguments and reports the resulting
// setup on cout.
//
// Arguments:
//   argc - number of entries in argv; must be at least 1.
//   argv - argv[0] is the random stream id (converted with atoi). Each
//          remaining entry is either "-v", which turns verbose output on,
//          or is copied into filename (the last one given wins).
//
// When filename is non-empty and stat() succeeds on it, the cmac is read
// with load_cmac_ca(). Otherwise a new CMAC_CA is created with the three
// variables pos, vel and for and four groups of grids, and its value
// function is initialized to V_0.
//
// Calls exit(1) when argc is less than 1.
void A_CMAC_DBI::init( int argc, char *argv[] )
{
    if (argc< 1) {
        cerr<< "A_CMAC_DBI::init: wrong number of arguments"<< endl;
        cerr<< "init args: rnd_stream [cmac.file] [-v (verbose)]"<< endl;
        exit(1);
    }
    
    cout<< "A_CMAC_DBI::init:"<< endl;

    // set random stream
    rnd_strm = atoi(*argv++);    argc--;
    
    // process command line arguments
    // NOTE(review): filename and verbose are only assigned here when the
    // matching argument is present; presumably they receive their defaults
    // elsewhere (e.g. the constructor) -- TODO confirm.

    while( argc-->0 ) {
        if ( strcmp(*argv,"-v") == 0 ) 
            verbose=TRUE;
        else
            // NOTE(review): unbounded copy; assumes filename can hold any
            // argument -- TODO confirm against the class declaration.
            strcpy(filename,*argv);
        
        argv++;
    }


    bool   load_file = FALSE;
    struct stat buf;
    if ( (strlen(filename)>0) && (stat(filename,&buf)==0) ) {
        // the named file exists: load the cmac from it
        
        load_file = TRUE;
        pcmac = load_cmac_ca( filename );
        // NOTE(review): pcmac is dereferenced further down; this assumes
        // error() does not return -- TODO confirm.
        if ( pcmac == 0 )
            error("A_CMAC_DBI::init: cannot read file: ",filename);
    }
    else {

        // no file to load: build a new cmac
        // (arguments presumably are 3 variables and 36 grids -- the grid
        // groups defined below add up to 12+12+6+6 = 36)
        pcmac = new CMAC_CA(3,36);

        int pos_id, vel_id, for_id;
        int ids[3];

        // define cmac variables

        pos_id=pcmac->define_variable("pos",POS_MIN,POS_MAX,
                                      NINTERVALS,CMAC::CLIP);
        vel_id=pcmac->define_variable("vel",VEL_MIN,VEL_MAX,
                                      NINTERVALS,CMAC::CLIP);
        for_id=pcmac->define_variable("for",FOR_MIN,FOR_MAX,
                                      NINTERVALS,CMAC::CLIP);

        // define cmac grids: the second argument of define_k_grids is the
        // number of entries of ids that are used

        ids[0]=pos_id;
        ids[1]=vel_id;
        ids[2]=for_id;
        pcmac->define_k_grids( 12, 3, &ids[0] );

        ids[0]=pos_id;
        ids[1]=vel_id;
        pcmac->define_k_grids( 12, 2, &ids[0] );

        ids[0]=pos_id;
        pcmac->define_k_grids(  6, 1, &ids[0] );

        ids[0]=vel_id;
        pcmac->define_k_grids(  6, 1, &ids[0] );

        // define cmac initial value-function

        pcmac->initialize_value_function( V_0 );
    }

    // report the configuration

    cout<< "\tprogram:        cmac learning: "<< endl;
    cout<< "\tfile:           ";
    if (load_file)               cout<< "loading "<< filename<< endl;
    else if (strlen(filename)>0) cout<< "will store in "<< filename<< endl;
    else                         cout<< "none"<< endl;
    cout<< "\tcmac definintion:"<< endl;
    cout<< "\t\tvariables: "<< pcmac->nvars<< endl;

    // one line per cmac variable
    for( int i=0 ; i < pcmac->nvars ; i++ ) {
        cout<< "\t\t    "<< pcmac->vars[i]->name<< ":\t";
        cout<< "max: "<< pcmac->vars[i]->max_limit<< "  ";
        cout<< "min: "<< pcmac->vars[i]->min_limit<< "  ";
        cout<< "intervals: "<< pcmac->vars[i]->intervals<< endl;
    }

    // NOTE(review): the four lines after the grid count are fixed text; they
    // describe the cmac built above and are printed unchanged for a cmac
    // loaded from a file.
    cout<< "\t\ttillings: "<< pcmac->ngrids<< endl;
    cout<< "\t\t    12 with pos, vel, for"<< endl;
    cout<< "\t\t    12 with pos, vel"<< endl;
    cout<< "\t\t     6 with pos"<< endl;
    cout<< "\t\t     6 with vel"<< endl;
    
    cout<< "\tgamma:          "<< GAMMA<< endl;
    cout<< "\tepsilon:        "<< EPSILON<< endl;
    cout<< "\tlambda:         "<< LAMBDA<< endl;
    cout<< "\talpha:          "<< ALPHA<< endl;
}


//============================================================================
// A_CMAC_DBI::start_trial

// Begins a new trial and chooses the first action.
//
// Arguments:
//   ps - initial sensation; it is handed to policy() as a State.
//
// Returns a Force allocated with new and filled in by policy(). This
// function does not release it.
Action *A_CMAC_DBI::start_trial( const Sensation *ps )
{
    // a trial starts with all eligibilities cleared
    pcmac->clear_eligibilities();

    // policy() fills in the force; the value it returns is not used here
    Force *first_action = new Force;
    policy( (State *)ps, first_action );

    return first_action;
}


//============================================================================
// A_CMAC_DBI::step

// Performs one learning update and chooses the next action.
//
// Arguments:
//   pnext_s - sensation that followed the previous action, or
//             TERMINAL_STATE when the trial has ended.
//   reward  - reward received for the previous action.
//
// The value of the previous decision is computed from the member features.
// The update target is reward for a terminal state, and
// reward + GAMMA*(value returned by policy for pnext_s) otherwise. The
// difference between target and previous value, scaled by ALPHA, is passed
// to update_value_function.
//
// Returns 0 for a terminal state; otherwise a Force allocated with new and
// filled in by policy(). This function does not release it.
Action *A_CMAC_DBI::step( const Sensation *pnext_s,
                          double           reward )
{
    if (verbose)
        cerr<< "A_CMAC_DBI::step:"<< endl;

    // decay the traces, evaluate the previous decision, then mark its tiles
    pcmac->update_eligibilities( GAMMA, LAMBDA );
    const double prev_value = pcmac->compute_value_function( features );
    pcmac->replace_eligibilities( features );

    // a terminal state (ie. out of bounds) contributes no future value
    const bool terminal = ( pnext_s == TERMINAL_STATE );

    Force  *pnext_a  = 0;
    double  expected = reward;

    if ( !terminal ) {
        // choose the next action; policy() also stores the tiles of that
        // decision in features for the following step
        pnext_a  = new Force;
        expected = reward + GAMMA*policy( (State *)pnext_s, pnext_a );
    }

    const double delta = expected - prev_value;

    if (verbose) {
        cerr<< "\texpected: ";
        cerr.width(12);
        cerr<< expected<< "   current: ";
        cerr.width(12);
        cerr<< prev_value<< "   delta: ";
        cerr<< delta<< endl;
    }

    pcmac->update_value_function( ALPHA*delta );

    return pnext_a;
}


//============================================================================
// A_CMAC_DBI::output_value_function_grid

void A_CMAC_DBI::output_value_function_grid( ostream& to )
{
    int    i, j;
    int    features[36];
    double c_pos, c_vel, c_for;
    State  wrk_s;
    Force  wrk_a;
    double code[3];
    double best, tmp;

    c_pos = (POS_MAX-POS_MIN)/NINTERVALS;
    c_vel = (VEL_MAX-VEL_MIN)/NINTERVALS;
    c_for = (FOR_MAX-FOR_MIN)/NACTIONS;
    
    for( j=NINTERVALS-1 ; j >= 0 ; j-- ) {
        for( i=0 ; i < NINTERVALS ; i++ ) {
            wrk_s.pos = POS_MIN + c_pos/2.0 + c_pos*i;
            wrk_s.vel = VEL_MIN + c_vel/2.0 + c_vel*j;
            wrk_a.acc = FOR_MIN + c_for/2.0;

            encode_state( code, &wrk_s );
            encode_action( code, &wrk_a  );
            
            pcmac->get_tiles( features, code);
            best = pcmac->compute_value_function(features);
            
            for( int a=1 ; a < NACTIONS ; a++ ) {
                wrk_a.acc = FOR_MIN + c_for/2.0 + a*c_for;
                encode_action( code, &wrk_a );
                
                pcmac->get_tiles( features, code);
                tmp = pcmac->compute_value_function(features);
                if ( tmp > best)
                    best = tmp;
            }

            to<< best<< endl;
        }
        to<< endl;
    }
}


//============================================================================
// A_CMAC_DBI::policy

double A_CMAC_DBI::policy( const State *ps, Force *pa )
{
    if (verbose)
        cerr<< "A_CMAC_DBI::policy:"<< endl;

    double  code[3];
    double  value;
    int     wrk_features[36];
    double  delta_a = (FOR_MAX-FOR_MIN)/NACTIONS;

    // encode state with the minimum value for force first

    encode_state( code, ps );
    
    double qvalues[NACTIONS];

    // analize all possibilities
    
    for( int a=0 ; a < NACTIONS ; a++ ) {
        pa->acc = FOR_MIN + delta_a/2.0 + a*delta_a;
        encode_action( code, pa );
        pcmac->get_tiles( wrk_features, code );
        qvalues[a] = pcmac->compute_value_function(wrk_features);
    }

    // greedy search
    
    int action = select_best(NACTIONS,qvalues,1.0-EPSILON,rnd_strm);
    pa->acc    = FOR_MIN + delta_a/2.0 + action*delta_a;
    value      = qvalues[action];

    // store state-action features of decision
    
    encode_action( code, pa );
    pcmac->get_tiles( features, code );

    if (verbose) {
        cerr<< "\tvalue:        "<< value<< endl;
        cerr<< "\tacceleration: "<< pa->acc<< endl;
    
        for( int k=0 ;k < NACTIONS ; k++ ) {
            cerr<< "\t";
            cerr.width(10);
            cerr<< qvalues[k];
        }
        cerr<< endl;

        cerr<< "\t";
        for( int k=0 ; k < 36 ; k++ )
            cerr<< features[k]<< " ";
        cerr<< endl;
    }
   
    return value;
}


// -- Local function definitions

void encode_state( double code[3], const State *ps )
{
    code[POS] = ps->pos;
    code[VEL] = ps->vel;;
}


void encode_action( double code[3], const Force *pa )
{
    code[FOR] = pa->acc;
}


/****************************** end of file *********************************/