Source code for btgym.datafeed.derivative

###############################################################################
#
# Copyright (C) 2017-2018 Andrew Muzikin
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
###############################################################################

from logbook import WARNING
from .base import BTgymBaseData
import datetime


class BTgymEpisode(BTgymBaseData):
    """
    Low-level data class. Implements the `Episode` object containing a single episode data sequence.
    Does not allow further sampling or data loading. Supposed to be converted to a bt.datafeed object
    via the .to_btfeed() method. Do not use directly.
    """
    def __init__(
            self,
            filename=None,
            parsing_params=None,
            sampling_params=None,
            name=None,
            data_names=('default_asset',),
            task=0,
            log_level=WARNING,
            _config_stack=None,
    ):
        super(BTgymEpisode, self).__init__(
            filename=filename,
            parsing_params=parsing_params,
            sampling_params=None,
            name='episode',
            task=task,
            data_names=data_names,
            log_level=log_level,
            _config_stack=_config_stack
        )

    def reset(self, **kwargs):
        raise RuntimeError('Episode object does not support the .reset() method.')

    def sample(self, **kwargs):
        raise RuntimeError('Episode object does not support the .sample() method.')

class BTgymDataTrial(BTgymBaseData):
    """
    Intermediate-level data class. Implements the concept of a `Trial` object.
    Supports data train/test separation. Do not use directly.
    """
    trial_params = dict(
        nested_class_ref=BTgymEpisode,
    )

    def __init__(
            self,
            filename=None,
            parsing_params=None,
            sampling_params=None,
            name=None,
            data_names=('default_asset',),
            frozen_time_split=None,
            task=0,
            log_level=WARNING,
            _config_stack=None,
    ):
        """
        Args:
            filename:           not used;
            sampling_params:    dict, sample retrieving options, see base class description for details;
            task:               int, optional;
            parsing_params:     csv parsing options, see base class description for details;
            log_level:          int, optional, logbook.level;
            _config_stack:      dict, holding configuration for nested child samples;
        """
        super(BTgymDataTrial, self).__init__(
            filename=filename,
            parsing_params=parsing_params,
            sampling_params=sampling_params,
            name='Trial',
            data_names=data_names,
            frozen_time_split=frozen_time_split,
            task=task,
            log_level=log_level,
            _config_stack=_config_stack
        )

class BTgymRandomDataDomain(BTgymBaseData):
    """
    Top-level data class. Implements one way data domains can be defined,
    namely when the source domain precedes the target one.

    Implements the pipe::

        Domain.sample() --> Trial.sample() --> Episode.to_btfeed() --> bt.Strategy

    This particular class randomly samples Trials from the provided dataset.
    """
    # Classes to use for sample objects:
    trial_class_ref = BTgymDataTrial
    episode_class_ref = BTgymEpisode

    def __init__(
            self,
            trial_params,
            episode_params,
            filename=None,
            dataframe=None,
            parsing_params=None,
            target_period=None,
            use_target_backshift=False,
            frozen_time_split=None,
            name='RndDataDomain',
            task=0,
            data_names=('default_asset',),
            log_level=WARNING,
    ):
        """
        Args:
            filename:               str or list of str, file names containing CSV historic data;
            dataframe:              pd.DataFrame or iterable of pd.DataFrames containing historic data;
            parsing_params:         csv parsing options, see base class description for details;
            trial_params:           dict, describes trial parameters, should contain keys:
                                    {sample_duration, time_gap, start_00, start_weekdays, test_period, expanding};
            episode_params:         dict, describes episode parameters, should contain keys:
                                    {sample_duration, time_gap, start_00, start_weekdays};
            target_period:          dict, None or int, domain target period,
                                    def={'days': 0, 'hours': 0, 'minutes': 0};
                                    setting this param to a non-zero duration forces separation into source/target
                                    domains (which can be thought of as creating top-level train/test subsets), with
                                    target data duration equal to `target_period`; if set to None - no target period
                                    is assumed; if set to -1 - no source period is assumed;
                                    source data always precedes target data;
            use_target_backshift:   bool, if True - target domain is shifted back by the duration of the trial train
                                    period, thus allowing training on part of the target domain data, namely the
                                    train part of the trial closest to the source/target break point;
            name:                   str, optional;
            task:                   int, optional;
            log_level:              int, logbook.level;
        """
        sample_params_keys = {'sample_duration', 'time_gap'}
        assert isinstance(trial_params, dict) and sample_params_keys <= set(trial_params.keys()),\
            'Expected dict <trial_params> to contain keys: {}, got: {}'.format(sample_params_keys, trial_params)

        assert isinstance(episode_params, dict) and sample_params_keys <= set(episode_params.keys()),\
            'Expected dict <episode_params> to contain keys: {}, got: {}'.format(sample_params_keys, episode_params)

        if parsing_params is None:
            parsing_params = dict(
                # Default parameters for source-specific CSV datafeed class,
                # correctly parses 1 minute Forex generic ASCII
                # data files from www.HistData.com:

                # CSV to Pandas params:
                sep=';',
                header=0,
                index_col=0,
                parse_dates=True,
                names=['open', 'high', 'low', 'close', 'volume'],

                # Pandas to BT.feeds params:
                timeframe=1,  # 1 minute.
                datetime=0,
                open=1,
                high=2,
                low=3,
                close=4,
                volume=-1,
                openinterest=-1,
            )

        # Hacky, because we want the trial test period to be an attribute of the Trial instance
        # and the top-level test (target) period to be an attribute of the Domain instance:
        try:
            trial_test_period = trial_params.pop('test_period')

        except (AttributeError, KeyError):
            trial_test_period = {'days': 0, 'hours': 0, 'minutes': 0}

        episode_params.update({'test_period': trial_test_period})

        # if target_period is None:
        #     target_period = {'days': 0, 'hours': 0, 'minutes': 0}
        trial_params['test_period'] = target_period

        # Setting target backshift:
        if use_target_backshift:
            trial_params['_test_period_backshift_delta'] =\
                datetime.timedelta(**trial_params['sample_duration']) - datetime.timedelta(**trial_test_period)

        episode_config = dict(
            class_ref=self.episode_class_ref,
            kwargs=dict(
                parsing_params=parsing_params,
                sampling_params=None,
                name='episode',
                task=task,
                log_level=log_level,
                _config_stack=None,
            ),
        )
        trial_config = dict(
            class_ref=self.trial_class_ref,
            kwargs=dict(
                parsing_params=parsing_params,
                sampling_params=episode_params,
                name='trial',
                task=task,
                frozen_time_split=frozen_time_split,
                log_level=log_level,
                _config_stack=[episode_config],
            ),
        )

        super(BTgymRandomDataDomain, self).__init__(
            filename=filename,
            dataframe=dataframe,
            parsing_params=parsing_params,
            sampling_params=trial_params,
            name=name,
            task=task,
            frozen_time_split=frozen_time_split,
            data_names=data_names,
            log_level=log_level,
            _config_stack=[episode_config, trial_config]
        )

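A minimal usage sketch of the sampling pipe described above, Domain.sample() --> Trial.sample() --> Episode.to_btfeed(). The CSV file path and all duration values below are hypothetical placeholders, not library defaults, and the exact reset()/sample() keyword options are those defined in btgym.datafeed.base::

    domain = BTgymRandomDataDomain(
        filename='./some_1min_forex_data.csv',  # hypothetical file path
        trial_params=dict(
            sample_duration={'days': 30, 'hours': 0, 'minutes': 0},
            time_gap={'days': 2, 'hours': 0},
            start_00=True,
            start_weekdays=[0, 1, 2, 3, 4],
            test_period={'days': 2, 'hours': 0, 'minutes': 0},
            expanding=False,
        ),
        episode_params=dict(
            sample_duration={'days': 0, 'hours': 23, 'minutes': 55},
            time_gap={'days': 0, 'hours': 6},
            start_00=False,
            start_weekdays=[0, 1, 2, 3, 4],
        ),
        target_period={'days': 10, 'hours': 0, 'minutes': 0},
    )
    domain.reset()                 # load and parse the data, prepare sampling intervals
    trial = domain.sample()        # BTgymDataTrial instance
    trial.reset()                  # prepare the trial's own sampling intervals
    episode = trial.sample()       # BTgymEpisode instance
    feed = episode.to_btfeed()     # backtrader-ready data feed(s) for a bt.Strategy
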
class BTgymDataset(BTgymRandomDataDomain):
    """
    Simple top-level data class, implements direct random episode sampling from the data set
    induced by a csv file, i.e. it is a special case for `Trial=def=Episode`.
    Supports source and target data domain separation with one caveat - see Note.

    Note:
        Due to the current implementation, sampling a test episode actually requires sampling a test TRIAL.
        To be improved.
    """
    class BTgymSimpleTrial(BTgymDataTrial):
        """
        Truncated Trial without a test period: always samples from the train subset;
        the sampled episode inherits the train/test metadata of the parent trial.
        """
        def sample(self, sample_type=0, **kwargs):
            episode = self._sample(sample_type=0, **kwargs)
            episode.metadata['type'] = sample_type
            return episode

    # Override trial sample class:
    trial_class_ref = BTgymSimpleTrial

    params_deprecated = dict(
        episode_len_days=('episode_duration', 'days'),
        episode_len_hours=('episode_duration', 'hours'),
        episode_len_minutes=('episode_duration', 'minutes'),
        time_gap_days=('time_gap', 'days'),
        time_gap_hours=('time_gap', 'hours')
    )

    def __init__(
            self,
            filename=None,
            episode_duration=None,
            time_gap=None,
            start_00=False,
            start_weekdays=None,
            parsing_params=None,
            target_period=None,
            name='SimpleDataSet',
            data_names=('default_asset',),
            log_level=WARNING,
            **kwargs
    ):
        """
        Args:
            filename:           str or list of str, file names containing CSV historic data;
            episode_duration:   dict, maximum episode duration in d:h:m,
                                def={'days': 0, 'hours': 23, 'minutes': 55}, alias for `sample_duration`;
            time_gap:           dict, data time gap allowed within sample in d:h:m,
                                def={'days': 0, 'hours': 6};
            start_00:           bool, episode start point will be shifted back to the first record
                                of the day (usually 00:00), def=False;
            start_weekdays:     list, only weekdays from the list will be used for sample start,
                                def=[0, 1, 2, 3, 4, 5, 6];
            target_period:      domain test (aka target) period, def={'days': 0, 'hours': 0, 'minutes': 0};
                                setting this param to a non-zero duration forces data separation into
                                train/test subsets; train data always precedes test data;
            parsing_params:     csv parsing options, see base class description for details;
            name:               str, instance name;
            log_level:          int, logbook.level;
            **kwargs:           deprecated kwargs;
        """
        # Default sample time duration:
        if episode_duration is None:
            self._episode_duration = dict(
                days=0,
                hours=23,
                minutes=55,
            )
        else:
            self._episode_duration = episode_duration

        # Default data time gap allowed within sample:
        if time_gap is None:
            self._time_gap = dict(
                days=0,
                hours=6,
            )
        else:
            self._time_gap = time_gap

        # Default weekdays:
        if start_weekdays is None:
            start_weekdays = [0, 1, 2, 3, 4, 5, 6]

        # Insert deprecated params, if any:
        for key, value in kwargs.items():
            if key in self.params_deprecated.keys():
                self.log.warning(
                    'Key: <{}> is deprecated, use: <{}> instead'.format(key, self.params_deprecated[key])
                )
                key1, key2 = self.params_deprecated[key]
                attr = getattr(self, key1)
                attr[key2] = value

        trial_params = dict(
            sample_duration=self._episode_duration,
            start_weekdays=start_weekdays,
            start_00=start_00,
            time_gap=self._time_gap,
            # test_period={'days': 0, 'hours': 0, 'minutes': 0},
            test_period=target_period,
            expanding=False
        )
        episode_params = trial_params.copy()

        super(BTgymDataset, self).__init__(
            filename=filename,
            parsing_params=parsing_params,
            trial_params=trial_params,
            episode_params=episode_params,
            target_period=target_period,
            name=name,
            data_names=data_names,
            log_level=log_level,
        )

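A usage sketch for BTgymDataset, assuming a hypothetical CSV file path; the episode_duration and time_gap values repeat the documented defaults, while target_period illustrates carving a 30-day test (target) tail off the data::

    dataset = BTgymDataset(
        filename='./data/some_1min_forex_data.csv',  # hypothetical file path
        start_weekdays=[0, 1, 2, 3],
        start_00=True,
        episode_duration={'days': 0, 'hours': 23, 'minutes': 55},
        time_gap={'days': 0, 'hours': 6},
        target_period={'days': 30, 'hours': 0, 'minutes': 0},
    )

Such an instance is normally handed to the environment (e.g. via the dataset kwarg of BTgymEnv) rather than sampled by hand.
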
class BTgymDataset2(BTgymRandomDataDomain):
    """
    Simple top-level data class, implements direct random episode sampling from the data set
    induced by a csv file, i.e. it is a special case for `Trial=def=Episode`.
    """
    def __init__(
            self,
            filename=None,
            dataframe=None,
            episode_duration=None,
            time_gap=None,
            start_00=False,
            start_weekdays=None,
            parsing_params=None,
            target_period=None,
            name='SimpleDataSet2',
            data_names=('default_asset',),
            log_level=WARNING,
            **kwargs
    ):
        """
        Args:
            filename:           str or list of str, file names containing CSV historic data;
            dataframe:          pd.DataFrame or iterable of pd.DataFrames containing historic data;
            episode_duration:   dict, maximum episode duration in d:h:m,
                                def={'days': 0, 'hours': 23, 'minutes': 55}, alias for `sample_duration`;
            time_gap:           dict, data time gap allowed within sample in d:h:m,
                                def={'days': 0, 'hours': 6};
            start_00:           bool, episode start point will be shifted back to the first record
                                of the day (usually 00:00), def=False;
            start_weekdays:     list, only weekdays from the list will be used for sample start,
                                def=[0, 1, 2, 3, 4, 5, 6];
            target_period:      domain test (aka target) period, def={'days': 0, 'hours': 0, 'minutes': 0};
                                setting this param to a non-zero duration forces data separation into
                                train/test subsets; train data always precedes test data;
            parsing_params:     csv parsing options, see base class description for details;
            name:               str, instance name;
            log_level:          int, logbook.level;
            **kwargs:
        """
        # Default sample time duration:
        if episode_duration is None:
            self._episode_duration = dict(
                days=0,
                hours=23,
                minutes=55,
            )
        else:
            self._episode_duration = episode_duration

        # Default data time gap allowed within sample:
        if time_gap is None:
            self._time_gap = dict(
                days=0,
                hours=6,
            )
        else:
            self._time_gap = time_gap

        # Default weekdays:
        if start_weekdays is None:
            start_weekdays = [0, 1, 2, 3, 4, 5, 6]

        trial_params = dict(
            sample_duration=self._episode_duration,
            start_weekdays=start_weekdays,
            start_00=start_00,
            time_gap=self._time_gap,
            # test_period={'days': 0, 'hours': 0, 'minutes': 0},
            test_period=target_period,
            expanding=False
        )
        episode_params = trial_params.copy()

        super(BTgymDataset2, self).__init__(
            filename=filename,
            dataframe=dataframe,
            parsing_params=parsing_params,
            trial_params=trial_params,
            episode_params=episode_params,
            target_period=target_period,
            name=name,
            data_names=data_names,
            log_level=log_level,
        )

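BTgymDataset2 additionally accepts in-memory data via the dataframe argument as an alternative to a CSV file. A sketch with a synthetic pandas DataFrame follows; it assumes the open/high/low/close/volume column layout of the default parsing parameters over a 1-minute datetime index, and every value is fabricated purely for illustration::

    import numpy as np
    import pandas as pd

    index = pd.date_range('2016-01-04 00:00', periods=10000, freq='1min')
    price = 1.09 + 0.0001 * np.cumsum(np.random.randn(index.shape[0]))
    frame = pd.DataFrame(
        {
            'open': price,
            'high': price + 0.0002,
            'low': price - 0.0002,
            'close': price,
            'volume': np.zeros(index.shape[0]),
        },
        index=index,
    )
    dataset = BTgymDataset2(
        dataframe=frame,
        episode_duration={'days': 0, 'hours': 23, 'minutes': 55},
        time_gap={'days': 0, 'hours': 6},
        target_period={'days': 1, 'hours': 0, 'minutes': 0},
    )
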