###############################################################################
#
# Copyright (C) 2017-2018 Andrew Muzikin
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
###############################################################################
from logbook import WARNING
from .base import BTgymBaseData
import datetime
class BTgymEpisode(BTgymBaseData):
    """
    Low-level data class.

    Represents an `Episode`: a container for a single episode data sequence.
    Further sampling is not allowed: `.reset()` and `.sample()` always raise.
    Supposed to be converted to bt.datafeed object via .to_btfeed() method.

    Do not use directly.
    """

    def __init__(
            self,
            filename=None,
            parsing_params=None,
            sampling_params=None,
            name=None,
            data_names=('default_asset',),
            task=0,
            log_level=WARNING,
            _config_stack=None,
    ):
        """
        Args:
            filename:           forwarded to the base class initializer;
            parsing_params:     forwarded to the base class initializer;
            sampling_params:    accepted for signature compatibility only,
                                the base class always receives None;
            name:               accepted for signature compatibility only,
                                the base class always receives 'episode';
            data_names:         forwarded to the base class initializer;
            task:               forwarded to the base class initializer;
            log_level:          forwarded to the base class initializer;
            _config_stack:      forwarded to the base class initializer;
        """
        base_kwargs = {
            'filename': filename,
            'parsing_params': parsing_params,
            # An episode is a terminal sample, so no sampling options are handed down:
            'sampling_params': None,
            'name': 'episode',
            'task': task,
            'data_names': data_names,
            'log_level': log_level,
            '_config_stack': _config_stack,
        }
        super(BTgymEpisode, self).__init__(**base_kwargs)

    def reset(self, **kwargs):
        """Not supported for episodes: always raises RuntimeError."""
        raise RuntimeError('Episode object doesnt support .reset() method.')

    def sample(self, **kwargs):
        """Not supported for episodes: always raises RuntimeError."""
        raise RuntimeError('Episode object doesnt support .sample() method.')
class BTgymDataTrial(BTgymBaseData):
    """
    Intermediate-level data class.

    Implements the concept of a `Trial` object.
    Supports data train/test separation.

    Do not use directly.
    """
    # Class-level parameters: episodes are the nested sample class of a trial.
    trial_params = {'nested_class_ref': BTgymEpisode}

    def __init__(
            self,
            filename=None,
            parsing_params=None,
            sampling_params=None,
            name=None,
            data_names=('default_asset',),
            frozen_time_split=None,
            task=0,
            log_level=WARNING,
            _config_stack=None,
    ):
        """
        Args:
            filename:           not used by this class itself, forwarded to the base class;
            parsing_params:     csv parsing options, see base class description for details;
            sampling_params:    dict, sample retrieving options, see base class description for details;
            name:               accepted for signature compatibility only,
                                the base class always receives 'Trial';
            data_names:         forwarded to the base class initializer;
            frozen_time_split:  forwarded to the base class initializer;
            task:               int, optional;
            log_level:          int, optional, logbook.level;
            _config_stack:      dict, holding configuration for nested child samples;
        """
        base_kwargs = {
            'filename': filename,
            'parsing_params': parsing_params,
            'sampling_params': sampling_params,
            'name': 'Trial',
            'data_names': data_names,
            'frozen_time_split': frozen_time_split,
            'task': task,
            'log_level': log_level,
            '_config_stack': _config_stack,
        }
        super(BTgymDataTrial, self).__init__(**base_kwargs)
class BTgymRandomDataDomain(BTgymBaseData):
    """
    Top-level data class. Implements one way data domains can be defined,
    namely when the source domain precedes the target one. Implements pipe::

        Domain.sample() --> Trial.sample() --> Episode.to_btfeed() --> bt.Strategy

    This particular class randomly samples Trials from provided dataset.
    """
    # Classes to use for sample objects:
    trial_class_ref = BTgymDataTrial
    episode_class_ref = BTgymEpisode

    def __init__(
            self,
            trial_params,
            episode_params,
            filename=None,
            dataframe=None,
            parsing_params=None,
            target_period=None,
            use_target_backshift=False,
            frozen_time_split=None,
            name='RndDataDomain',
            task=0,
            data_names=('default_asset',),
            log_level=WARNING,
    ):
        """
        Args:
            trial_params:       dict, describes trial parameters, should contain keys:
                                {sample_duration, time_gap, start_00, start_weekdays, test_period, expanding};
                                only `sample_duration` and `time_gap` are checked here;
                                the dict passed in is not modified;
            episode_params:     dict, describes episode parameters, should contain keys:
                                {sample_duration, time_gap, start_00, start_weekdays};
                                only `sample_duration` and `time_gap` are checked here;
                                the dict passed in is not modified;
            filename:           Str or list of str, file_names containing CSV historic data;
            dataframe:          pd.dataframe or iterable of pd.dataframes containing historic data;
            parsing_params:     csv parsing options, see base class description for details;
                                if None, defaults suited to 1 minute Forex generic ASCII
                                files from www.HistData.com are used;
            target_period:      dict, None or Int, domain target period, def={'days': 0, 'hours': 0, 'minutes': 0};
                                setting this param to non-zero duration forces separation to source/target
                                domains (which can be thought of as creating top-level train/test subsets) with
                                target data duration equal to `target_period`;
                                if set to None - no target period assumed;
                                if set to -1 - no source period assumed;
                                Source data always precedes target one.
            use_target_backshift:   bool, if true - target domain is shifted back by the duration of trial
                                train period, thus allowing training on part of target domain data,
                                namely train part of the trial closest to source/target break point.
            frozen_time_split:  forwarded to the base class and to the trial configuration;
            name:               str, optional
            task:               int, optional
            data_names:         forwarded to the base class initializer;
            log_level:          int, logbook.level

        Raises:
            AssertionError:     if `trial_params` or `episode_params` is not a dict holding
                                the keys `sample_duration` and `time_gap`.
        """
        sample_params_keys = {'sample_duration', 'time_gap'}

        assert isinstance(trial_params, dict) and sample_params_keys <= set(trial_params.keys()),\
            'Expected dict. <trial_params> contain keys: {}, got: {}'.format(sample_params_keys, trial_params)

        assert isinstance(episode_params, dict) and sample_params_keys <= set(episode_params.keys()), \
            'Expected dict. <episode_params> contain keys: {}, got: {}'.format(sample_params_keys, episode_params)

        # Work on shallow copies: keys are popped, overwritten and added below, and
        # doing that on the caller's dicts would corrupt them for any later reuse
        # (e.g. building a second domain from the same parameter dicts).
        trial_params = dict(trial_params)
        episode_params = dict(episode_params)

        if parsing_params is None:
            parsing_params = dict(
                # Default parameters for source-specific CSV datafeed class,
                # correctly parses 1 minute Forex generic ASCII
                # data files from www.HistData.com:

                # CSV to Pandas params.
                sep=';',
                header=0,
                index_col=0,
                parse_dates=True,
                names=['open', 'high', 'low', 'close', 'volume'],

                # Pandas to BT.feeds params:
                timeframe=1,  # 1 minute.
                datetime=0,
                open=1,
                high=2,
                low=3,
                close=4,
                volume=-1,
                openinterest=-1,
            )

        # Hacky cause we want trial test period to be attr of Trial instance
        # and top-level test (target) period to be attribute of Domain instance:
        trial_test_period = trial_params.pop('test_period', {'days': 0, 'hours': 0, 'minutes': 0})
        episode_params['test_period'] = trial_test_period
        trial_params['test_period'] = target_period

        # Setting target backshift: trial sample duration minus trial test period.
        # NOTE(review): this expects `trial_test_period` to be a dict of timedelta
        # keyword arguments; a None or int value fails here - confirm intended handling.
        if use_target_backshift:
            trial_params['_test_period_backshift_delta'] =\
                datetime.timedelta(**trial_params['sample_duration']) - datetime.timedelta(**trial_test_period)

        # Configuration of the innermost sample class (episodes):
        episode_config = dict(
            class_ref=self.episode_class_ref,
            kwargs=dict(
                parsing_params=parsing_params,
                sampling_params=None,
                name='episode',
                task=task,
                log_level=log_level,
                _config_stack=None,
            ),
        )
        # Configuration of the intermediate sample class (trials), which in turn samples episodes:
        trial_config = dict(
            class_ref=self.trial_class_ref,
            kwargs=dict(
                parsing_params=parsing_params,
                sampling_params=episode_params,
                name='trial',
                task=task,
                frozen_time_split=frozen_time_split,
                log_level=log_level,
                _config_stack=[episode_config],
            ),
        )

        super(BTgymRandomDataDomain, self).__init__(
            filename=filename,
            dataframe=dataframe,
            parsing_params=parsing_params,
            sampling_params=trial_params,
            name=name,
            task=task,
            frozen_time_split=frozen_time_split,
            data_names=data_names,
            log_level=log_level,
            _config_stack=[episode_config, trial_config]
        )
class BTgymDataset(BTgymRandomDataDomain):
    """
    Simple top-level data class, implements direct random episode sampling from data set induced by csv file,
    i.e it is a special case for `Trial=def=Episode`.

    Supports source and target data domains separation with some caveat - see Note.

    Note:
        Due to current implementation sampling test episode actually requires sampling test TRIAL.
        To be improved.
    """

    class BTgymSimpleTrial(BTgymDataTrial):
        """
        Truncated Trial without test period: always samples from train,
        sampled episode inherits train/test metadata of parent trial.
        """

        def sample(self, sample_type=0, **kwargs):
            """
            Sample an episode from train data and tag it with the requested type.

            Args:
                sample_type:    value stored as the `type` entry of the episode metadata;
                                the underlying `_sample` call always gets `sample_type=0`;
                **kwargs:       forwarded to `_sample`;

            Returns:
                the object returned by `_sample`, with `metadata['type']` set to `sample_type`.
            """
            episode = self._sample(sample_type=0, **kwargs)
            episode.metadata['type'] = sample_type
            return episode

    # Override trial sample class:
    trial_class_ref = BTgymSimpleTrial

    # Deprecated keyword -> (name of the replacing parameter, key inside that parameter's dict):
    params_deprecated = dict(
        episode_len_days=('episode_duration', 'days'),
        episode_len_hours=('episode_duration', 'hours'),
        episode_len_minutes=('episode_duration', 'minutes'),
        time_gap_days=('time_gap', 'days'),
        time_gap_hours=('time_gap', 'hours')
    )

    def __init__(
            self,
            filename=None,
            episode_duration=None,
            time_gap=None,
            start_00=False,
            start_weekdays=None,
            parsing_params=None,
            target_period=None,
            name='SimpleDataSet',
            data_names=('default_asset',),
            log_level=WARNING,
            **kwargs
    ):
        """
        Args:
            filename:           Str or list of str, file_names containing CSV historic data;
            episode_duration:   dict, maximum episode duration in d:h:m, def={'days': 0, 'hours': 23, 'minutes': 55},
                                alias for `sample_duration`; the dict passed in is copied, not modified;
            time_gap:           dict, data time gap allowed within sample in d:h:m, def={'days': 0, 'hours': 6};
                                the dict passed in is copied, not modified;
            start_00:           bool, episode start point will be shifted back to first record
                                of the day (usually 00:00), def=False;
            start_weekdays:     list, only weekdays from the list will be used for sample start,
                                def=[0, 1, 2, 3, 4, 5, 6];
            parsing_params:     csv parsing options, see base class description for details;
            target_period:      domain test(aka target) period, None by default;
                                setting this param to non-zero duration forces data separation to train/test
                                subsets. Train data always precedes test one.
            name:               str, instance name;
            data_names:         forwarded to the parent class initializer;
            log_level:          int, logbook.level;
            **kwargs:           deprecated kwargs, see `params_deprecated`; keys listed there override
                                the matching entry of `episode_duration` / `time_gap`,
                                any other key is ignored;
        """
        # Default sample time duration.
        # Copies are taken so that deprecated overrides below never write into caller's dicts:
        if episode_duration is None:
            self._episode_duration = dict(
                days=0,
                hours=23,
                minutes=55,
            )
        else:
            self._episode_duration = dict(episode_duration)

        # Default data time gap allowed within sample:
        if time_gap is None:
            self._time_gap = dict(
                days=0,
                hours=6,
            )
        else:
            self._time_gap = dict(time_gap)

        # Default weekdays:
        if start_weekdays is None:
            start_weekdays = [0, 1, 2, 3, 4, 5, 6]

        # Insert deprecated params, if any.
        # `params_deprecated` refers to public parameter names while values are held in
        # private attributes, so the targets are resolved explicitly here:
        override_targets = {
            'episode_duration': self._episode_duration,
            'time_gap': self._time_gap,
        }
        deprecation_notes = []
        for key, value in kwargs.items():
            if key in self.params_deprecated:
                param_name, unit = self.params_deprecated[key]
                override_targets[param_name][unit] = value
                deprecation_notes.append(
                    'Key: <{}> is deprecated, use: <{}> instead'.format(key, self.params_deprecated[key])
                )

        trial_params = dict(
            sample_duration=self._episode_duration,
            start_weekdays=start_weekdays,
            start_00=start_00,
            time_gap=self._time_gap,
            test_period=target_period,
            expanding=False
        )
        episode_params = trial_params.copy()

        super(BTgymDataset, self).__init__(
            filename=filename,
            parsing_params=parsing_params,
            trial_params=trial_params,
            episode_params=episode_params,
            target_period=target_period,
            name=name,
            data_names=data_names,
            log_level=log_level,
        )

        # Warnings are emitted only now: `self.log` is presumably created by the base
        # class initializer, so it cannot be relied upon before the call above.
        for note in deprecation_notes:
            self.log.warning(note)
class BTgymDataset2(BTgymRandomDataDomain):
    """
    Simple top-level data class, implements direct random episode sampling from data set induced by csv file,
    i.e it is a special case for `Trial=def=Episode`.
    """

    def __init__(
            self,
            filename=None,
            dataframe=None,
            episode_duration=None,
            time_gap=None,
            start_00=False,
            start_weekdays=None,
            parsing_params=None,
            target_period=None,
            name='SimpleDataSet2',
            data_names=('default_asset',),
            log_level=WARNING,
            **kwargs
    ):
        """
        Args:
            filename:           Str or list of str, file_names containing CSV historic data;
            dataframe:          pd.dataframe or iterable of pd.dataframes containing historic data;
            episode_duration:   dict, maximum episode duration in d:h:m, def={'days': 0, 'hours': 23, 'minutes': 55},
                                alias for `sample_duration`;
            time_gap:           dict, data time gap allowed within sample in d:h:m, def={'days': 0, 'hours': 6};
            start_00:           bool, episode start point will be shifted back to first record
                                of the day (usually 00:00), def=False;
            start_weekdays:     list, only weekdays from the list will be used for sample start,
                                def=[0, 1, 2, 3, 4, 5, 6];
            parsing_params:     csv parsing options, see base class description for details;
            target_period:      domain test(aka target) period, None by default;
                                setting this param to non-zero duration forces data separation to train/test
                                subsets. Train data always precedes test one.
            name:               str, instance name;
            data_names:         forwarded to the parent class initializer;
            log_level:          int, logbook.level;
            **kwargs:           accepted but not used by this initializer;
        """
        # Fall back to the default sample duration when none is given:
        self._episode_duration = (
            episode_duration
            if episode_duration is not None
            else {'days': 0, 'hours': 23, 'minutes': 55}
        )

        # Fall back to the default allowed in-sample data gap when none is given:
        self._time_gap = (
            time_gap
            if time_gap is not None
            else {'days': 0, 'hours': 6}
        )

        # Every weekday is a valid sample start unless told otherwise:
        weekdays = [0, 1, 2, 3, 4, 5, 6] if start_weekdays is None else start_weekdays

        # Trial and episode share the same sampling options (`Trial=def=Episode`):
        trial_params = {
            'sample_duration': self._episode_duration,
            'start_weekdays': weekdays,
            'start_00': start_00,
            'time_gap': self._time_gap,
            'test_period': target_period,
            'expanding': False,
        }
        episode_params = dict(trial_params)

        super(BTgymDataset2, self).__init__(
            filename=filename,
            dataframe=dataframe,
            parsing_params=parsing_params,
            trial_params=trial_params,
            episode_params=episode_params,
            target_period=target_period,
            name=name,
            data_names=data_names,
            log_level=log_level,
        )