Source code for btgym.research.strategy_gen_4

import numpy as np
from scipy.stats import zscore

import backtrader as bt
import backtrader.indicators as btind

from btgym.strategy.base import BTgymBaseStrategy
from btgym.strategy.utils import tanh, abs_norm_ratio, exp_scale, discounted_average, log_transform

from gym import spaces
from btgym import DictSpace

"""
Research grade code. Can be unstable, buggy, poor performing and generally is subject to change.
"""


class DevStrat_4_6(BTgymBaseStrategy):
    """
    Objectives:
        external state data feature search:
            time-embedded three-channel vector:
                - `Open` channel is one time-step difference of Open price;
                - `High` and `Low` channels are differences between current Open price
                  and current High or Low prices respectively;

        internal state data feature search:
            time-embedded concatenated vector of broker and portfolio statistics;
            time-embedded vector of last actions received (one-hot);
            time-embedded vector of rewards;

        reward shaping search:
            potential-based shaping functions;

    Data:
        synthetic/real
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = time_dim

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-1, high=1, shape=(time_dim, 1, 3), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 5), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        state_ext_scale=2e3,  # EURUSD
        state_int_scale=1.0,  # not used
        metadata={}
    )

    def __init__(self, **kwargs):
        """
        Args:
            **kwargs:   see BTgymBaseStrategy args.
        """
        super(DevStrat_4_6, self).__init__(**kwargs)

        self.state['metadata'] = self.metadata

        self.log.debug('DEV_state_shape: {}'.format(self.p.state_shape))
        self.log.debug('DEV_skip_frame: {}'.format(self.p.skip_frame))
        self.log.debug('DEV_portfolio_actions: {}'.format(self.p.portfolio_actions))
        self.log.debug('DEV_drawdown_call: {}'.format(self.p.drawdown_call))
        self.log.debug('DEV_target_call: {}'.format(self.p.target_call))
        self.log.debug('DEV_dataset_stat:\n{}'.format(self.p.dataset_stat))
        self.log.debug('DEV_episode_stat:\n{}'.format(self.p.episode_stat))

    def set_datalines(self):
        # Define data channels:
        self.channel_O = bt.Sum(self.data.open, - self.data.open(-1))
        self.channel_H = bt.Sum(self.data.high, - self.data.open)
        self.channel_L = bt.Sum(self.data.low, - self.data.open)

    def get_external_state(self):
        x = np.stack(
            [
                np.frombuffer(self.channel_O.get(size=self.time_dim)),
                np.frombuffer(self.channel_H.get(size=self.time_dim)),
                np.frombuffer(self.channel_L.get(size=self.time_dim)),
            ],
            axis=-1
        )
        # Amplify and squash into [-1, 1]; seems to be the best option as of 4.10.17:
        # `self.p.state_ext_scale` param is supposed to keep most of the signal
        # in the 'linear' part of tanh while squashing spikes.
        x_market = tanh(x * self.p.state_ext_scale)

        return x_market[:, None, :]
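
# Note: a minimal usage sketch, for orientation only. The kwarg names (`dataset`, `engine`,
# `verbose`) follow common btgym examples and may differ between versions; the CSV filename
# is purely illustrative:
#
#   import backtrader as bt
#   from btgym import BTgymEnv, BTgymDataset
#
#   cerebro = bt.Cerebro()
#   cerebro.addstrategy(DevStrat_4_6, drawdown_call=5, target_call=19, skip_frame=10)
#
#   env = BTgymEnv(
#       dataset=BTgymDataset(filename='EURUSD_M1_sample.csv'),  # illustrative file
#       engine=cerebro,
#       verbose=0,
#   )
#   obs = env.reset()  # dict observation with 'external', 'internal', 'metadata' keys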

class DevStrat_4_7(DevStrat_4_6):
    """
    4_6 +
    Sliding statistics avg_period disentangled from time embedding dim;
    only last-step sliding stats are used for internal state;
    reward weights: 1, 2, 10; reward scale factor added.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 1.0  # fi_gamma, should be MDP gamma decay
    reward_scale = 1.0  # reward scaler

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-1, high=1, shape=(time_dim, 1, 3), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(1, 1, 5), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        gamma=gamma,
        reward_scale=1.0,
        state_ext_scale=2e3,  # EURUSD
        state_int_scale=1.0,  # not used
        metadata={}
    )

    def __init__(self, **kwargs):
        super(DevStrat_4_7, self).__init__(**kwargs)

    def get_internal_state(self):
        # Single last-step slice of broker/portfolio statistics:
        x_broker = np.stack(
            [
                self.broker_stat['value'][-1],
                self.broker_stat['unrealized_pnl'][-1],
                self.broker_stat['realized_pnl'][-1],
                self.broker_stat['cash'][-1],
                self.broker_stat['exposure'][-1],
            ]
        )
        return x_broker[None, None, :]

class DevStrat_4_8(DevStrat_4_7):
    """
    4_7 +
    Uses full average_period of inner stats for use with inner_conv_encoder.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    # !..-> here it is also the `broker state` time-embedding period
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 1.0  # fi_gamma, should be MDP gamma decay, but somehow undiscounted works better <- wtf?
    reward_scale = 1  # reward multiplier

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-1, high=1, shape=(time_dim, 1, 3), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 5), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        gamma=gamma,
        reward_scale=1.0,
        state_ext_scale=2e3,  # EURUSD
        state_int_scale=1.0,  # not used
        metadata={},
    )

    def get_internal_state(self):
        x_broker = np.concatenate(
            [
                np.asarray(self.broker_stat['value'])[..., None],
                np.asarray(self.broker_stat['unrealized_pnl'])[..., None],
                np.asarray(self.broker_stat['realized_pnl'])[..., None],
                np.asarray(self.broker_stat['cash'])[..., None],
                np.asarray(self.broker_stat['exposure'])[..., None],
                # np.asarray(self.sliding_stat['episode_step'])[..., None],
                # np.asarray(self.sliding_stat['reward'])[..., None],
                # np.asarray(self.sliding_stat['action'])[..., None],
                # norm_position_duration[..., None],
                # max_unrealized_pnl[..., None],
                # min_unrealized_pnl[..., None],
            ],
            axis=-1
        )
        return x_broker[:, None, :]

class DevStrat_4_9(DevStrat_4_7):
    """
    4_7 +
    Uses simple SMA market state features.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 1.0  # fi_gamma, should be MDP gamma decay
    reward_scale = 1  # reward multiplier, touchy!

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-100, high=100, shape=(time_dim, 1, 8), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(1, 1, 5), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        gamma=gamma,
        reward_scale=1.0,
        state_ext_scale=1e4,  # EURUSD
        state_int_scale=1.0,  # not used
        metadata={},
    )

    def set_datalines(self):
        self.data.sma_4 = btind.SimpleMovingAverage(self.datas[0], period=4)
        self.data.sma_8 = btind.SimpleMovingAverage(self.datas[0], period=8)
        self.data.sma_16 = btind.SimpleMovingAverage(self.datas[0], period=16)
        self.data.sma_32 = btind.SimpleMovingAverage(self.datas[0], period=32)
        self.data.sma_64 = btind.SimpleMovingAverage(self.datas[0], period=64)
        self.data.sma_128 = btind.SimpleMovingAverage(self.datas[0], period=128)
        self.data.sma_256 = btind.SimpleMovingAverage(self.datas[0], period=256)

        self.data.dim_sma = btind.SimpleMovingAverage(
            self.datas[0],
            period=(256 + self.time_dim)
        )
        self.data.dim_sma.plotinfo.plot = False

    def get_external_state(self):
        x = np.stack(
            [
                np.frombuffer(self.data.open.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_4.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_8.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_16.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_32.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_64.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_128.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_256.get(size=self.time_dim)),
            ],
            axis=-1
        )
        # Gradient along features axis:
        x = np.gradient(x, axis=1) * self.p.state_ext_scale

        # Log-scale:
        x = log_transform(x)

        return x[:, None, :]
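
# Note on the feature construction above: `np.gradient(..., axis=1)` differentiates across
# the stacked price/SMA columns (i.e. between adjacent smoothing scales), not along time.
# A toy illustration with made-up values:
#
#   import numpy as np
#
#   x_sma = np.array(
#       [[1.10, 1.12, 1.15],   # one time row: e.g. sma_4, sma_8, sma_16
#        [1.11, 1.12, 1.14]]
#   )
#   np.gradient(x_sma, axis=-1)
#   # -> per-row central differences between neighbouring columns:
#   # [[0.02 , 0.025, 0.03 ],
#   #  [0.01 , 0.015, 0.02 ]]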

class DevStrat_4_10(DevStrat_4_7):
    """
    4_7 +
    Reward search: log-normalised potential functions. Nope.
    """
    # def get_reward(self):
    #     """
    #     Shapes reward function as normalized single trade realized profit/loss,
    #     augmented with potential-based reward shaping functions in form of:
    #     F(s, a, s`) = gamma * FI(s`) - FI(s);
    #
    #     - potential FI_1 is current normalized unrealized profit/loss;
    #     - potential FI_2 is current normalized broker value;
    #     - FI_3: penalizing exposure toward the end of episode.
    #
    #     Paper:
    #         "Policy invariance under reward transformations:
    #          Theory and application to reward shaping" by A. Ng et al., 1999;
    #         http://www.robotics.stanford.edu/~ang/papers/shaping-icml99.pdf
    #     """
    #
    #     # All sliding statistics for this step are already updated by get_state().
    #     debug = {}
    #     scale = 10.0
    #
    #     # Potential-based shaping function 1:
    #     # based on log potential of averaged profit/loss for current opened trade (unrealized p/l):
    #     unrealised_pnl = np.asarray(self.broker_stat['unrealized_pnl']) / 2 + 1  # shift [-1, 1] into positive [0.5, 1.5]
    #     # TODO: make normalizing util func to return in [0, 1] by default
    #     f1 = self.p.gamma * np.log(np.average(unrealised_pnl[1:])) - np.log(np.average(unrealised_pnl[:-1]))
    #
    #     debug['f1'] = f1
    #
    #     # Potential-based shaping function 2:
    #     # based on potential of averaged broker value, log-normalized wrt max drawdown and target bounds:
    #     norm_broker_value = np.asarray(self.broker_stat['value']) / 2 + 1  # shift [-1, 1] into positive [0.5, 1.5]
    #     f2 = self.p.gamma * np.log(np.average(norm_broker_value[1:])) - np.log(np.average(norm_broker_value[:-1]))
    #
    #     debug['f2'] = f2
    #
    #     # Potential-based shaping function 3: NOT USED
    #     # negative potential of abs. size of position, exponentially weighted wrt. episode steps:
    #     # abs_exposure = np.abs(np.asarray(self.broker_stat['exposure']))
    #     # time = np.asarray(self.broker_stat['episode_step'])
    #     # # time_w = exp_scale(np.average(time[:-1]), gamma=5)
    #     # # time_w_prime = exp_scale(np.average(time[1:]), gamma=5)
    #     # # f3 = - 1.0 * time_w_prime * np.average(abs_exposure[1:])  # + time_w * np.average(abs_exposure[:-1])
    #     # f3 = - self.p.gamma * exp_scale(time[-1], gamma=3) * abs_exposure[-1] + \
    #     #      exp_scale(time[-2], gamma=3) * abs_exposure[-2]
    #     # debug['f3'] = f3
    #     f3 = 1
    #
    #     # `Spike` reward function: normalized realized profit/loss:
    #     realized_pnl = self.broker_stat['realized_pnl'][-1]
    #     debug['f_real_pnl'] = 10 * realized_pnl
    #
    #     # Weights are subject to tune:
    #     self.reward = (1.0 * f1 + 2.0 * f2 + 0.0 * f3 + 10.0 * realized_pnl) * self.p.reward_scale
    #
    #     debug['r'] = self.reward
    #     debug['b_v'] = self.broker_stat['value'][-1]
    #     debug['unreal_pnl'] = self.broker_stat['unrealized_pnl'][-1]
    #     debug['iteration'] = self.iteration
    #
    #     # for k, v in debug.items():
    #     #     print('{}: {}'.format(k, v))
    #     # print('\n')
    #
    #     # ------ignore-----:
    #     # 'Do-not-expose-for-too-long' shaping term:
    #     # - 1.0 * self.exp_scale(avg_norm_position_duration, gamma=3)
    #
    #     self.reward = np.clip(self.reward, -self.p.reward_scale, self.p.reward_scale)
    #
    #     return self.reward
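
# Note: for reference, a minimal standalone sketch of the potential-based shaping term
# described above, F(s, s') = gamma * FI(s') - FI(s), with toy numbers (not part of the
# strategy itself):
#
#   import numpy as np
#
#   gamma = 0.99
#   unrealized_pnl = np.asarray([-0.2, -0.1, 0.0, 0.1]) / 2 + 1  # shifted positive: [0.9, 0.95, 1.0, 1.05]
#   fi_prime = np.log(np.average(unrealized_pnl[1:]))            # potential of the newer window: log(1.0) = 0.0
#   fi = np.log(np.average(unrealized_pnl[:-1]))                 # potential of the older window: log(0.95) ~ -0.051
#   f1 = gamma * fi_prime - fi                                   # ~ +0.051: positive while averaged p/l improves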

class DevStrat_4_11(DevStrat_4_10):
    """
    4_10 +
    Another set of SMA features, gradients for broker state.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 0.99  # fi_gamma, should be MDP gamma decay
    reward_scale = 1  # reward multiplier

    state_ext_scale = np.linspace(3e3, 1e3, num=5)

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-100, high=100, shape=(time_dim, 1, 5), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 6), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        gamma=gamma,
        reward_scale=1.0,
        state_ext_scale=state_ext_scale,  # EURUSD
        state_int_scale=1.0,
        metadata={},
    )

    def set_datalines(self):
        self.data.sma_16 = btind.SimpleMovingAverage(self.datas[0], period=16)
        self.data.sma_32 = btind.SimpleMovingAverage(self.datas[0], period=32)
        self.data.sma_64 = btind.SimpleMovingAverage(self.datas[0], period=64)
        self.data.sma_128 = btind.SimpleMovingAverage(self.datas[0], period=128)
        self.data.sma_256 = btind.SimpleMovingAverage(self.datas[0], period=256)

        self.data.dim_sma = btind.SimpleMovingAverage(
            self.datas[0],
            period=(256 + self.time_dim)
        )
        self.data.dim_sma.plotinfo.plot = False

    def get_external_state(self):
        x_sma = np.stack(
            [
                np.frombuffer(self.data.sma_16.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_32.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_64.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_128.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_256.get(size=self.time_dim)),
            ],
            axis=-1
        )
        # Gradient along features axis:
        dx = np.gradient(x_sma, axis=-1) * self.p.state_ext_scale

        # Squash into [-1, 1]:
        x = tanh(dx)
        return x[:, None, :]

    def get_internal_state(self):
        x_broker = np.concatenate(
            [
                np.asarray(self.broker_stat['value'])[..., None],
                np.asarray(self.broker_stat['unrealized_pnl'])[..., None],
                np.asarray(self.broker_stat['realized_pnl'])[..., None],
                np.asarray(self.broker_stat['cash'])[..., None],
                np.asarray(self.broker_stat['exposure'])[..., None],
                np.asarray(self.broker_stat['pos_direction'])[..., None],
                # np.asarray(self.broker_stat['value'])[-self.p.skip_frame:, None],
                # np.asarray(self.broker_stat['unrealized_pnl'])[-self.p.skip_frame:, None],
                # np.asarray(self.broker_stat['realized_pnl'])[-self.p.skip_frame:, None],
                # np.asarray(self.broker_stat['cash'])[-self.p.skip_frame:, None],
                # np.asarray(self.broker_stat['exposure'])[-self.p.skip_frame:, None],
                # np.asarray(self.broker_stat['pos_direction'])[-self.p.skip_frame:, None],
            ],
            axis=-1
        )
        # Gradient along features axis, scaled and squashed:
        x_broker = tanh(np.gradient(x_broker, axis=-1) * self.p.state_int_scale)
        # return x_broker[:, None, :]
        return np.clip(x_broker[:, None, :], -2, 2)

class DevStrat_4_11_1(DevStrat_4_11):
    """
    4_11 +
    External state split into a DictSpace with two sub-states:
    'diff' - gradient across SMA features, 'avg' - gradient along the time axis.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 0.99  # fi_gamma, should be MDP gamma decay
    reward_scale = 1  # reward multiplier

    state_ext_scale = np.linspace(3e3, 1e3, num=5)

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': DictSpace(
                {
                    'diff': spaces.Box(low=-100, high=100, shape=(time_dim, 1, 5), dtype=np.float32),
                    'avg': spaces.Box(low=-100, high=100, shape=(time_dim, 1, 5), dtype=np.float32),
                }
            ),
            'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 6), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        gamma=gamma,
        reward_scale=1.0,
        state_ext_scale=state_ext_scale,  # EURUSD
        state_int_scale=1.0,
        metadata={},
    )

    def get_external_state(self):
        x_sma = np.stack(
            [
                np.frombuffer(self.data.sma_16.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_32.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_64.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_128.get(size=self.time_dim)),
                np.frombuffer(self.data.sma_256.get(size=self.time_dim)),
            ],
            axis=-1
        )
        # Gradient along features axis:
        diff = np.gradient(x_sma, axis=-1) * self.p.state_ext_scale
        diff = tanh(diff)

        # Gradient along time axis:
        avg = np.gradient(x_sma, axis=0) * self.p.state_ext_scale
        avg = tanh(avg)

        return {'avg': avg[:, None, :], 'diff': diff[:, None, :]}

class DevStrat_4_12(DevStrat_4_11):
    """
    4_11 +
    SMA features with periods set by `features_parameters`;
    encoded datetime state added.
    """

    # Time embedding period:
    time_dim = 30  # NOTE: changed this --> change Policy UNREAL for aux. pix control task upsampling params

    # Hyperparameters for estimating signal features:
    features_parameters = [8, 16, 32, 64, 128, 256]
    num_features = len(features_parameters)

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 10

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 20

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    gamma = 0.99  # fi_gamma, should be MDP gamma decay
    reward_scale = 1  # reward multiplier

    state_ext_scale = np.linspace(3e3, 1e3, num=num_features)

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape={
            'external': spaces.Box(low=-100, high=100, shape=(time_dim, 1, num_features), dtype=np.float32),
            'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 5), dtype=np.float32),
            'datetime': spaces.Box(low=0, high=1, shape=(1, 5), dtype=np.float32),
            'metadata': DictSpace(
                {
                    'type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'trial_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'trial_type': spaces.Box(shape=(), low=0, high=1, dtype=np.uint32),
                    'sample_num': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'first_row': spaces.Box(shape=(), low=0, high=10 ** 10, dtype=np.uint32),
                    'timestamp': spaces.Box(shape=(), low=0, high=np.finfo(np.float64).max, dtype=np.float64),
                }
            )
        },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=None,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        state_ext_scale=state_ext_scale,  # EURUSD
        state_int_scale=1.0,
        gamma=gamma,
        reward_scale=1.0,
        metadata={},
    )

    def set_datalines(self):
        self.data.features = [
            btind.SimpleMovingAverage(self.datas[0], period=period) for period in self.features_parameters
        ]

        self.data.dim_sma = btind.SimpleMovingAverage(
            self.datas[0],
            period=(np.asarray(self.features_parameters).max() + self.time_dim)
        )
        self.data.dim_sma.plotinfo.plot = False

    def get_external_state(self):
        x_sma = np.stack(
            [feature.get(size=self.time_dim) for feature in self.data.features],
            axis=-1
        )
        # Gradient along features axis:
        dx = np.gradient(x_sma, axis=-1) * self.p.state_ext_scale

        # In [-1, 1]:
        x = tanh(dx)
        return x[:, None, :]

    def get_internal_state(self):
        x_broker = np.concatenate(
            [
                np.asarray(self.broker_stat['value'])[..., None],
                np.asarray(self.broker_stat['unrealized_pnl'])[..., None],
                np.asarray(self.broker_stat['realized_pnl'])[..., None],
                np.asarray(self.broker_stat['cash'])[..., None],
                np.asarray(self.broker_stat['exposure'])[..., None],
            ],
            axis=-1
        )
        x_broker = tanh(np.gradient(x_broker, axis=-1) * self.p.state_int_scale)
        return x_broker[:, None, :]

    def get_datetime_state(self):
        time = self.data.datetime.time()
        date = self.data.datetime.date()

        # Encode in [0, 1]:
        mn = date.month / 12
        wd = date.weekday() / 6
        d = date.day / 31
        h = time.hour / 24
        mm = time.minute / 60

        encoded_stamp = [mn, d, wd, h, mm]
        return np.asarray(encoded_stamp)[None, :]
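
# Note: a quick sanity check of the datetime encoding used by `get_datetime_state`
# (toy timestamp, values rounded):
#
#   import datetime
#
#   stamp = datetime.datetime(2016, 3, 14, 9, 30)
#   encoded = [
#       stamp.month / 12,      # 0.25
#       stamp.day / 31,        # ~0.452
#       stamp.weekday() / 6,   # 0.0 (Monday)
#       stamp.hour / 24,       # 0.375
#       stamp.minute / 60,     # 0.5
#   ]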