Source code for btgym.research.gps.aac

import tensorflow as tf

from btgym.algorithms import BaseAAC
from .loss import guided_aac_loss_def_0_0, guided_aac_loss_def_0_1, guided_aac_loss_def_0_3
from btgym.algorithms.runner.synchro import BaseSynchroRunner, VerboseSynchroRunner


class GuidedAAC(BaseAAC):
    """
    Actor-critic framework augmented with expert actions imitation loss:

        L_gps = aac_lambda * L_a3c + guided_lambda * L_im

    This implementation is loosely referred to as 'guided policy search' after the algorithm
    described by S. Levine and P. Abbeel in
    `Learning Neural Network Policies with Guided Policy Search under Unknown Dynamics`,
    in the sense that it exploits the idea of fitting a 'local' (here: single-episode) oracle
    for an environment with generally unknown dynamics, and uses the actions it demonstrates
    to optimize the trajectory distribution of the training agent.

    Note that this particular implementation of the expert does not provide a complete
    action-state space trajectory for the agent to follow. Instead, it estimates an `advised`
    categorical distribution over actions conditioned on `external` (i.e. price dynamics)
    state observations only.

    Papers:
        - Levine et al., 'Learning Neural Network Policies with Guided Policy Search under Unknown Dynamics',
          https://people.eecs.berkeley.edu/~svlevine/papers/mfcgps.pdf

        - Brys et al., 'Reinforcement Learning from Demonstration through Shaping',
          https://www.ijcai.org/Proceedings/15/Papers/472.pdf

        - Wiewiora et al., 'Principled Methods for Advising Reinforcement Learning Agents',
          http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.14.6412&rep=rep1&type=pdf
    """

    def __init__(
            self,
            expert_loss=guided_aac_loss_def_0_3,
            aac_lambda=1.0,
            guided_lambda=1.0,
            guided_decay_steps=None,
            runner_config=None,
            # aux_render_modes=('action_prob', 'value_fn', 'lstm_1_h', 'lstm_2_h'),
            aux_render_modes=None,
            name='GuidedA3C',
            **kwargs
    ):
        """
        Args:
            expert_loss:        callable returning tensor holding on_policy imitation loss graph and summaries
            aac_lambda:         float, main on_policy a3c loss lambda
            guided_lambda:      float, imitation loss lambda
            guided_decay_steps: number of steps guided_lambda is annealed to zero
            name:               str, name scope
            **kwargs:           see BaseAAC kwargs
        """
        try:
            self.expert_loss = expert_loss
            self.aac_lambda = aac_lambda
            self.guided_lambda = guided_lambda * 1.0
            self.guided_decay_steps = guided_decay_steps
            self.guided_lambda_decayed = None
            self.train_guided_lambda = None

            if runner_config is None:
                runner_config = {
                    'class_ref': BaseSynchroRunner,
                    'kwargs': {
                        'aux_render_modes': aux_render_modes,  # ('action_prob', 'value_fn', 'lstm_1_h', 'lstm_2_h'),
                    }
                }
            super(GuidedAAC, self).__init__(
                runner_config=runner_config,
                name=name,
                aux_render_modes=aux_render_modes,
                **kwargs
            )
        except Exception:
            msg = 'GuidedAAC.__init__() exception occurred' + \
                  '\n\nPress `Ctrl-C` or jupyter:[Kernel]->[Interrupt] for clean exit.\n'
            self.log.exception(msg)
            raise RuntimeError(msg)

    def _make_loss(self, **kwargs):
        """
        Augments base loss with expert actions imitation loss.

        Returns:
            tensor holding estimated loss graph
            list of related summaries
        """
        aac_loss, summaries = self._make_base_loss(**kwargs)

        # Guidance annealing:
        if self.guided_decay_steps is not None:
            self.guided_lambda_decayed = tf.train.polynomial_decay(
                self.guided_lambda,
                self.global_step + 1,
                self.guided_decay_steps,
                0.0,
                power=1,
                cycle=False,
            )
        else:
            self.guided_lambda_decayed = self.guided_lambda

        # Switch to zero when testing - prevents information leakage:
        self.train_guided_lambda = self.guided_lambda_decayed * tf.cast(self.local_network.train_phase, tf.float32)

        self.guided_loss, guided_summary = self.expert_loss(
            pi_actions=self.local_network.on_logits,
            expert_actions=self.local_network.expert_actions,
            name='on_policy',
            verbose=True,
            guided_lambda=self.train_guided_lambda
        )
        loss = self.aac_lambda * aac_loss + self.guided_loss
        summaries += guided_summary

        self.log.notice(
            'guided_lambda: {:1.6f}, guided_decay_steps: {}'.format(self.guided_lambda, self.guided_decay_steps)
        )
        return loss, summaries
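
# A minimal illustrative sketch (not part of the original module) of the guidance
# annealing performed in `_make_loss()` above: with power=1, end value 0.0 and
# cycle=False, `tf.train.polynomial_decay` reduces guided_lambda linearly to zero
# over `guided_decay_steps` and then holds it there; the multiplication by
# `train_phase` additionally zeroes the imitation term at test time.
def _guided_lambda_schedule(guided_lambda, global_step, guided_decay_steps, train_phase=1.0):
    """Pure-python mirror of the annealed imitation loss weight (assumed helper)."""
    step = min(global_step, guided_decay_steps)  # no cycling past the decay horizon
    return guided_lambda * (1.0 - float(step) / guided_decay_steps) * train_phase

# E.g. guided_lambda=1.0, guided_decay_steps=1000:
# step 0 -> 1.0; step 500 -> 0.5; step >= 1000 -> 0.0;
# the total loss L_gps = aac_lambda * L_a3c + guided_lambda(t) * L_im thus
# gradually reduces to the pure on-policy A3C objective.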
class VerboseGuidedAAC(GuidedAAC):
    """
    Extends parent `GuidedAAC` class with additional summaries.
    """

    def __init__(
            self,
            runner_config=None,
            aux_render_modes=('action_prob', 'value_fn', 'lstm_1_h', 'lstm_2_h'),
            name='VerboseGuidedA3C',
            **kwargs
    ):
        # Note: any `runner_config` passed in is ignored here; a VerboseSynchroRunner
        # configuration is always substituted:
        super(VerboseGuidedAAC, self).__init__(
            name=name,
            runner_config={
                'class_ref': VerboseSynchroRunner,
                'kwargs': {
                    'aux_render_modes': aux_render_modes,
                }
            },
            aux_render_modes=aux_render_modes,
            **kwargs
        )
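
# A hedged usage sketch (assumed, not part of this module): in btgym, trainer
# classes such as GuidedAAC / VerboseGuidedAAC are normally referenced from a
# launcher-style `trainer_config` dict rather than instantiated directly; all
# remaining BaseAAC kwargs (environment, policy and logging configuration) are
# omitted here for brevity.
trainer_config = dict(
    class_ref=VerboseGuidedAAC,    # or GuidedAAC for the non-verbose version
    kwargs=dict(
        aac_lambda=1.0,            # weight of the base on-policy A3C loss
        guided_lambda=0.5,         # initial weight of the expert imitation loss
        guided_decay_steps=10**6,  # anneal guided_lambda to zero over 1M steps
        # ... see BaseAAC for the full set of accepted kwargs
    ),
)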