Source code for btgym.research.strategy_gen_4

import numpy as np
from scipy.stats import zscore

import backtrader as bt
import backtrader.indicators as btind

from btgym.strategy.base import BTgymBaseStrategy
from btgym.strategy.utils import tanh, abs_norm_ratio, exp_scale, discounted_average, log_transform

from gym import spaces
from btgym import DictSpace

"""
Research grade code. Can be unstable, buggy, poor performing and generally is subject to change.
"""


class DevStrat_4_6(BTgymBaseStrategy):
    """
    Objectives:
        external state data feature search:
            time-embedded three-channel vector:
                - `Open` channel is one time-step difference of Open price;
                - `High` and `Low` channels are differences between current Open price
                  and current High or Low prices respectively;

        internal state data feature search:
            time-embedded concatenated vector of broker and portfolio statistics;
            time-embedded vector of last actions received (one-hot);
            time-embedded vector of rewards;

        reward shaping search:
            potential-based shaping functions;

    Data:
        synthetic/real
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = time_dim

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-1, high=1, shape=(time_dim, 1, 3), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 5), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        state_ext_scale=2e3,  # EURUSD
        state_int_scale=1.0,  # not used
        metadata={}
    )

    def __init__(self, **kwargs):
        """
        Args:
            **kwargs:   see BTgymBaseStrategy args.
        """
        super(DevStrat_4_6, self).__init__(**kwargs)

        self.state['metadata'] = self.metadata

        self.log.debug('DEV_state_shape: {}'.format(self.p.state_shape))
        self.log.debug('DEV_skip_frame: {}'.format(self.p.skip_frame))
        self.log.debug('DEV_portfolio_actions: {}'.format(self.p.portfolio_actions))
        self.log.debug('DEV_drawdown_call: {}'.format(self.p.drawdown_call))
        self.log.debug('DEV_target_call: {}'.format(self.p.target_call))
        self.log.debug('DEV_dataset_stat:\n{}'.format(self.p.dataset_stat))
        self.log.debug('DEV_episode_stat:\n{}'.format(self.p.episode_stat))

    def set_datalines(self):
        # Define data channels:
        self.channel_O = bt.Sum(self.data.open, - self.data.open(-1))
        self.channel_H = bt.Sum(self.data.high, - self.data.open)
        self.channel_L = bt.Sum(self.data.low, - self.data.open)

    def get_external_state(self):
        x = np.stack(
            [
                np.frombuffer(self.channel_O.get(size=self.time_dim)),
                np.frombuffer(self.channel_H.get(size=self.time_dim)),
                np.frombuffer(self.channel_L.get(size=self.time_dim)),
            ],
            axis=-1
        )
        # Amplify and squash into [-1, 1]; seems to be the best option as of 4.10.17:
        # `self.p.state_ext_scale` param is supposed to keep most of the signal
        # in the 'linear' part of tanh while squashing spikes.
        x_market = tanh(x * self.p.state_ext_scale)

        return x_market[:, None, :]
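
# Note: a minimal usage sketch, for orientation only. The kwarg names (`dataset`, `engine`,
# `verbose`) follow common btgym examples and may differ between versions; the CSV filename
# is purely illustrative:
#
#   import backtrader as bt
#   from btgym import BTgymEnv, BTgymDataset
#
#   cerebro = bt.Cerebro()
#   cerebro.addstrategy(DevStrat_4_6, drawdown_call=5, target_call=19, skip_frame=10)
#
#   env = BTgymEnv(
#       dataset=BTgymDataset(filename='EURUSD_M1_sample.csv'),  # illustrative file
#       engine=cerebro,
#       verbose=0,
#   )
#   obs = env.reset()  # dict observation with 'external', 'internal', 'metadata' keys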

class DevStrat_4_7(DevStrat_4_6):
    """
    4_6 +
    Sliding statistics avg_period disentangled from time embedding dim;
    only last-step sliding stats are used for internal state;
    reward weights: 1, 2, 10; reward scale factor added.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 1.0  # fi_gamma, should be MDP gamma decay
    reward_scale = 1.0  # reward scaler

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-1, high=1, shape=(time_dim, 1, 3), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(1, 1, 5), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        gamma=gamma,
        reward_scale=1.0,
        state_ext_scale=2e3,  # EURUSD
        state_int_scale=1.0,  # not used
        metadata={}
    )

    def __init__(self, **kwargs):
        super(DevStrat_4_7, self).__init__(**kwargs)

    def get_internal_state(self):
        # Single last-step slice of broker/portfolio statistics:
        x_broker = np.stack(
            [
                self.broker_stat['value'][-1],
                self.broker_stat['unrealized_pnl'][-1],
                self.broker_stat['realized_pnl'][-1],
                self.broker_stat['cash'][-1],
                self.broker_stat['exposure'][-1],
            ]
        )
        return x_broker[None, None, :]

class DevStrat_4_8(DevStrat_4_7):
    """
    4_7 +
    Uses full average_period of inner stats for use with inner_conv_encoder.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    # !..-> here it is also the `broker state` time-embedding period
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 1.0  # fi_gamma, should be MDP gamma decay, but somehow undiscounted works better <- wtf?
    reward_scale = 1  # reward multiplier

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-1, high=1, shape=(time_dim, 1, 3), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 5), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        gamma=gamma,
        reward_scale=1.0,
        state_ext_scale=2e3,  # EURUSD
        state_int_scale=1.0,  # not used
        metadata={},
    )

    def get_internal_state(self):
        x_broker = np.concatenate(
            [
                np.asarray(self.broker_stat['value'])[..., None],
                np.asarray(self.broker_stat['unrealized_pnl'])[..., None],
                np.asarray(self.broker_stat['realized_pnl'])[..., None],
                np.asarray(self.broker_stat['cash'])[..., None],
                np.asarray(self.broker_stat['exposure'])[..., None],
                # np.asarray(self.sliding_stat['episode_step'])[..., None],
                # np.asarray(self.sliding_stat['reward'])[..., None],
                # np.asarray(self.sliding_stat['action'])[..., None],
                # norm_position_duration[..., None],
                # max_unrealized_pnl[..., None],
                # min_unrealized_pnl[..., None],
            ],
            axis=-1
        )
        return x_broker[:, None, :]

class DevStrat_4_9(DevStrat_4_7):
    """
    4_7 +
    Uses simple SMA market state features.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 1.0  # fi_gamma, should be MDP gamma decay
    reward_scale = 1  # reward multiplier, touchy!

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-100, high=100, shape=(time_dim, 1, 8), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(1, 1, 5), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        gamma=gamma,
        reward_scale=1.0,
        state_ext_scale=1e4,  # EURUSD
        state_int_scale=1.0,  # not used
        metadata={},
    )

    def set_datalines(self):
        self.data.sma_4 = btind.SimpleMovingAverage(self.datas[0], period=4)
        self.data.sma_8 = btind.SimpleMovingAverage(self.datas[0], period=8)
        self.data.sma_16 = btind.SimpleMovingAverage(self.datas[0], period=16)
        self.data.sma_32 = btind.SimpleMovingAverage(self.datas[0], period=32)
        self.data.sma_64 = btind.SimpleMovingAverage(self.datas[0], period=64)
        self.data.sma_128 = btind.SimpleMovingAverage(self.datas[0], period=128)
        self.data.sma_256 = btind.SimpleMovingAverage(self.datas[0], period=256)

        self.data.dim_sma = btind.SimpleMovingAverage(
            self.datas[0],
            period=(256 + self.time_dim)
        )
        self.data.dim_sma.plotinfo.plot = False

    def get_external_state(self):
        x = np.stack(
            [
                np.frombuffer(self.data.open.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_4.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_8.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_16.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_32.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_64.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_128.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_256.get(size=self.time_dim)),
            ],
            axis=-1
        )
        # Gradient along features axis:
        x = np.gradient(x, axis=1) * self.p.state_ext_scale

        # Log-scale:
        x = log_transform(x)

        return x[:, None, :]
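
# Note on the feature construction above: `np.gradient(..., axis=1)` differentiates across
# the stacked price/SMA columns (i.e. between adjacent smoothing scales), not along time.
# A toy illustration with made-up values:
#
#   import numpy as np
#
#   x_sma = np.array(
#       [[1.10, 1.12, 1.15],   # one time row: e.g. sma_4, sma_8, sma_16
#        [1.11, 1.12, 1.14]]
#   )
#   np.gradient(x_sma, axis=-1)
#   # -> per-row central differences between neighbouring columns:
#   # [[0.02 , 0.025, 0.03 ],
#   #  [0.01 , 0.015, 0.02 ]]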

class DevStrat_4_10(DevStrat_4_7):
    """
    4_7 +
    Reward search: log-normalised potential functions. Nope.
    """
    # def get_reward(self):
    #     """
    #     Shapes reward function as normalized single trade realized profit/loss,
    #     augmented with potential-based reward shaping functions in form of:
    #     F(s, a, s`) = gamma * FI(s`) - FI(s);
    #
    #     - potential FI_1 is current normalized unrealized profit/loss;
    #     - potential FI_2 is current normalized broker value;
    #     - FI_3: penalizing exposure toward the end of episode.
    #
    #     Paper:
    #         "Policy invariance under reward transformations:
    #          Theory and application to reward shaping" by A. Ng et al., 1999;
    #         http://www.robotics.stanford.edu/~ang/papers/shaping-icml99.pdf
    #     """
    #
    #     # All sliding statistics for this step are already updated by get_state().
    #     debug = {}
    #     scale = 10.0
    #
    #     # Potential-based shaping function 1:
    #     # based on log potential of averaged profit/loss for current opened trade (unrealized p/l):
    #     unrealised_pnl = np.asarray(self.broker_stat['unrealized_pnl']) / 2 + 1  # shift [-1, 1] into positive [0.5, 1.5]
    #     # TODO: make normalizing util func to return in [0, 1] by default
    #     f1 = self.p.gamma * np.log(np.average(unrealised_pnl[1:])) - np.log(np.average(unrealised_pnl[:-1]))
    #
    #     debug['f1'] = f1
    #
    #     # Potential-based shaping function 2:
    #     # based on potential of averaged broker value, log-normalized wrt max drawdown and target bounds:
    #     norm_broker_value = np.asarray(self.broker_stat['value']) / 2 + 1  # shift [-1, 1] into positive [0.5, 1.5]
    #     f2 = self.p.gamma * np.log(np.average(norm_broker_value[1:])) - np.log(np.average(norm_broker_value[:-1]))
    #
    #     debug['f2'] = f2
    #
    #     # Potential-based shaping function 3: NOT USED
    #     # negative potential of abs. size of position, exponentially weighted wrt. episode steps:
    #     # abs_exposure = np.abs(np.asarray(self.broker_stat['exposure']))
    #     # time = np.asarray(self.broker_stat['episode_step'])
    #     # # time_w = exp_scale(np.average(time[:-1]), gamma=5)
    #     # # time_w_prime = exp_scale(np.average(time[1:]), gamma=5)
    #     # # f3 = - 1.0 * time_w_prime * np.average(abs_exposure[1:])  # + time_w * np.average(abs_exposure[:-1])
    #     # f3 = - self.p.gamma * exp_scale(time[-1], gamma=3) * abs_exposure[-1] + \
    #     #      exp_scale(time[-2], gamma=3) * abs_exposure[-2]
    #     # debug['f3'] = f3
    #     f3 = 1
    #
    #     # `Spike` reward function: normalized realized profit/loss:
    #     realized_pnl = self.broker_stat['realized_pnl'][-1]
    #     debug['f_real_pnl'] = 10 * realized_pnl
    #
    #     # Weights are subject to tune:
    #     self.reward = (1.0 * f1 + 2.0 * f2 + 0.0 * f3 + 10.0 * realized_pnl) * self.p.reward_scale
    #
    #     debug['r'] = self.reward
    #     debug['b_v'] = self.broker_stat['value'][-1]
    #     debug['unreal_pnl'] = self.broker_stat['unrealized_pnl'][-1]
    #     debug['iteration'] = self.iteration
    #
    #     # for k, v in debug.items():
    #     #     print('{}: {}'.format(k, v))
    #     # print('\n')
    #
    #     # ------ignore-----:
    #     # 'Do-not-expose-for-too-long' shaping term:
    #     # - 1.0 * self.exp_scale(avg_norm_position_duration, gamma=3)
    #
    #     self.reward = np.clip(self.reward, -self.p.reward_scale, self.p.reward_scale)
    #
    #     return self.reward
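
# Note: for reference, a minimal standalone sketch of the potential-based shaping term
# described above, F(s, s') = gamma * FI(s') - FI(s), with toy numbers (not part of the
# strategy itself):
#
#   import numpy as np
#
#   gamma = 0.99
#   unrealized_pnl = np.asarray([-0.2, -0.1, 0.0, 0.1]) / 2 + 1  # shifted positive: [0.9, 0.95, 1.0, 1.05]
#   fi_prime = np.log(np.average(unrealized_pnl[1:]))            # potential of the newer window: log(1.0) = 0.0
#   fi = np.log(np.average(unrealized_pnl[:-1]))                 # potential of the older window: log(0.95) ~ -0.051
#   f1 = gamma * fi_prime - fi                                   # ~ +0.051: positive while averaged p/l improves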

class DevStrat_4_11(DevStrat_4_10):
    """
    4_10 +
    Another set of SMA features, gradients for broker state.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 0.99  # fi_gamma, should be MDP gamma decay
    reward_scale = 1  # reward multiplier

    state_ext_scale = np.linspace(3e3, 1e3, num=5)

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-100, high=100, shape=(time_dim, 1, 5), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 6), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        gamma=gamma,
        reward_scale=1.0,
        state_ext_scale=state_ext_scale,  # EURUSD
        state_int_scale=1.0,
        metadata={},
    )

    def set_datalines(self):
        self.data.sma_16 = btind.SimpleMovingAverage(self.datas[0], period=16)
        self.data.sma_32 = btind.SimpleMovingAverage(self.datas[0], period=32)
        self.data.sma_64 = btind.SimpleMovingAverage(self.datas[0], period=64)
        self.data.sma_128 = btind.SimpleMovingAverage(self.datas[0], period=128)
        self.data.sma_256 = btind.SimpleMovingAverage(self.datas[0], period=256)

        self.data.dim_sma = btind.SimpleMovingAverage(
            self.datas[0],
            period=(256 + self.time_dim)
        )
        self.data.dim_sma.plotinfo.plot = False

    def get_external_state(self):
        x_sma = np.stack(
            [
                np.frombuffer(self.data.sma_16.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_32.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_64.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_128.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_256.get(size=self.time_dim)),
            ],
            axis=-1
        )
        # Gradient along features axis:
        dx = np.gradient(x_sma, axis=-1) * self.p.state_ext_scale

        # Squash into [-1, 1]:
        x = tanh(dx)
        return x[:, None, :]

    def get_internal_state(self):
        x_broker = np.concatenate(
            [
                np.asarray(self.broker_stat['value'])[..., None],
                np.asarray(self.broker_stat['unrealized_pnl'])[..., None],
                np.asarray(self.broker_stat['realized_pnl'])[..., None],
                np.asarray(self.broker_stat['cash'])[..., None],
                np.asarray(self.broker_stat['exposure'])[..., None],
                np.asarray(self.broker_stat['pos_direction'])[..., None],
                # np.asarray(self.broker_stat['value'])[-self.p.skip_frame:, None],
                # np.asarray(self.broker_stat['unrealized_pnl'])[-self.p.skip_frame:, None],
                # np.asarray(self.broker_stat['realized_pnl'])[-self.p.skip_frame:, None],
                # np.asarray(self.broker_stat['cash'])[-self.p.skip_frame:, None],
                # np.asarray(self.broker_stat['exposure'])[-self.p.skip_frame:, None],
                # np.asarray(self.broker_stat['pos_direction'])[-self.p.skip_frame:, None],
            ],
            axis=-1
        )
        # Gradient along features axis, scaled and squashed:
        x_broker = tanh(np.gradient(x_broker, axis=-1) * self.p.state_int_scale)
        # return x_broker[:, None, :]
        return np.clip(x_broker[:, None, :], -2, 2)

class DevStrat_4_11_1(DevStrat_4_11):
    """
    4_11 +
    External state split into a DictSpace with two sub-states:
    'diff' - gradient across SMA features, 'avg' - gradient along the time axis.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 0.99  # fi_gamma, should be MDP gamma decay
    reward_scale = 1  # reward multiplier

    state_ext_scale = np.linspace(3e3, 1e3, num=5)

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': DictSpace(
                {
                    'diff': spaces.Box(low=-100, high=100, shape=(time_dim, 1, 5), dtype=np.float32),
                    'avg': spaces.Box(low=-100, high=100, shape=(time_dim, 1, 5), dtype=np.float32),
                }
            ),
            'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 6), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        gamma=gamma,
        reward_scale=1.0,
        state_ext_scale=state_ext_scale,  # EURUSD
        state_int_scale=1.0,
        metadata={},
    )

    def get_external_state(self):
        x_sma = np.stack(
            [
                np.frombuffer(self.data.sma_16.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_32.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_64.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_128.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_256.get(size=self.time_dim)),
            ],
            axis=-1
        )
        # Gradient along features axis:
        diff = np.gradient(x_sma, axis=-1) * self.p.state_ext_scale
        diff = tanh(diff)

        # Gradient along time axis:
        avg = np.gradient(x_sma, axis=0) * self.p.state_ext_scale
        avg = tanh(avg)

        return {'avg': avg[:, None, :], 'diff': diff[:, None, :]}

class DevStrat_4_12(DevStrat_4_11):
    """
    4_11 +
    SMA features with periods set by `features_parameters`;
    encoded datetime state added.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Hyperparameters for estimating signal features:
    features_parameters = [8, 16, 32, 64, 128, 256]
    num_features = len(features_parameters)

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 0.99  # fi_gamma, should be MDP gamma decay
    reward_scale = 1  # reward multiplier

    state_ext_scale = np.linspace(3e3, 1e3, num=num_features)

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-100, high=100, shape=(time_dim, 1, num_features), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 5), dtype=np.float32),
            'datetime': spaces.Box(low=0, high=1, shape=(1, 5), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        state_ext_scale=state_ext_scale,  # EURUSD
        state_int_scale=1.0,
        gamma=gamma,
        reward_scale=1.0,
        metadata={},
    )

    def set_datalines(self):
        self.data.features = [
            btind.SimpleMovingAverage(self.datas[0], period=period) for period in self.features_parameters
        ]

        self.data.dim_sma = btind.SimpleMovingAverage(
            self.datas[0],
            period=(np.asarray(self.features_parameters).max() + self.time_dim)
        )
        self.data.dim_sma.plotinfo.plot = False

    def get_external_state(self):
        x_sma = np.stack(
            [feature.get(size=self.time_dim) for feature in self.data.features],
            axis=-1
        )
        # Gradient along features axis:
        dx = np.gradient(x_sma, axis=-1) * self.p.state_ext_scale

        # In [-1, 1]:
        x = tanh(dx)
        return x[:, None, :]

    def get_internal_state(self):
        x_broker = np.concatenate(
            [
                np.asarray(self.broker_stat['value'])[..., None],
                np.asarray(self.broker_stat['unrealized_pnl'])[..., None],
                np.asarray(self.broker_stat['realized_pnl'])[..., None],
                np.asarray(self.broker_stat['cash'])[..., None],
                np.asarray(self.broker_stat['exposure'])[..., None],
            ],
            axis=-1
        )
        x_broker = tanh(np.gradient(x_broker, axis=-1) * self.p.state_int_scale)
        return x_broker[:, None, :]

    def get_datetime_state(self):
        time = self.data.datetime.time()
        date = self.data.datetime.date()

        # Encode in [0, 1]:
        mn = date.month / 12
        wd = date.weekday() / 6
        d = date.day / 31
        h = time.hour / 24
        mm = time.minute / 60

        encoded_stamp = [mn, d, wd, h, mm]
        return np.asarray(encoded_stamp)[None, :]
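
# Note: a quick sanity check of the datetime encoding used by `get_datetime_state`
# (toy timestamp, values rounded):
#
#   import datetime
#
#   stamp = datetime.datetime(2016, 3, 14, 9, 30)
#   encoded = [
#       stamp.month / 12,      # 0.25
#       stamp.day / 31,        # ~0.452
#       stamp.weekday() / 6,   # 0.0 (Monday)
#       stamp.hour / 24,       # 0.375
#       stamp.minute / 60,     # 0.5
#   ]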