Source code for mtenv.envs.control.acrobot

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


import numpy as np
from gym import spaces
from numpy import cos, pi, sin

from mtenv import MTEnv
from mtenv.utils import seeding

__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy"
__credits__ = [
    "Alborz Geramifard",
    "Robert H. Klein",
    "Christoph Dann",
    "William Dabney",
    "Jonathan P. How",
]
__license__ = "BSD 3-Clause"
__author__ = "Christoph Dann <cdann@cdann.de>"

# SOURCE:
# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py


class MTAcrobot(MTEnv):
    """An acrobot environment with varying characteristics.

    The task descriptor is composed of values between -1 and +1 and is
    mapped to the acrobot's physical characteristics in the
    ``self._mu_to_vars`` function.
    """

    metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 15}

    dt = 0.2

    def _mu_to_vars(self, mu):
        self.LINK_LENGTH_1 = 1.0 + mu[0] * 0.5
        self.LINK_LENGTH_2 = 1.0 + mu[1] * 0.5
        self.LINK_MASS_1 = 1.0 + mu[2] * 0.5
        self.LINK_MASS_2 = 1.0 + mu[3] * 0.5
        self.LINK_COM_POS_1 = 0.5
        self.LINK_COM_POS_2 = 0.5
        if mu[6] > 0:
            self.AVAIL_TORQUE = [-1.0, 0.0, 1.0]
        else:
            self.AVAIL_TORQUE = [1.0, 0.0, -1.0]
        self.LINK_MOI = 1.0

    torque_noise_max = 0.0

    MAX_VEL_1 = 4 * pi + pi
    MAX_VEL_2 = 9 * pi + 2 * pi

    #: use dynamics equations from the nips paper or the book
    book_or_nips = "book"
    action_arrow = None
    domain_fig = None
    actions_num = 3

    def __init__(self):
        self.viewer = None
        self.action_space = spaces.Discrete(3)
        self.state = None
        high = np.array(
            [1.5, 1.5, 1.5, 1.5, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32
        )
        low = -high
        observation_space = spaces.Box(low=low, high=high, dtype=np.float32)
        action_space = spaces.Discrete(3)
        # seven task-descriptor components, each in [-1, 1], matching the
        # task state produced by `sample_task_state`
        high = np.array([1.0 for k in range(7)])
        task_space = spaces.Box(-high, high, dtype=np.float32)
        super().__init__(
            action_space=action_space,
            env_observation_space=observation_space,
            task_observation_space=task_space,
        )
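
    # Worked example (illustrative, not part of the original source): with a
    # task descriptor mu = [0.5, -0.5, 0.0, 0.0, 0.0, 0.0, 1.0],
    # `_mu_to_vars` sets LINK_LENGTH_1 = 1.25, LINK_LENGTH_2 = 0.75,
    # LINK_MASS_1 = LINK_MASS_2 = 1.0, and, since mu[6] > 0,
    # AVAIL_TORQUE = [-1.0, 0.0, 1.0].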

    def step(self, a):
        self.t += 1
        self._mu_to_vars(self.task_state)
        s = self.state
        torque = self.AVAIL_TORQUE[a]

        # Add noise to the force action
        if self.torque_noise_max > 0:
            torque += self.np_random_env.uniform(
                -self.torque_noise_max, self.torque_noise_max
            )

        # Now, augment the state with our force action so it can be passed to
        # _dsdt
        s_augmented = np.append(s, torque)

        ns = rk4(self._dsdt, s_augmented, [0, self.dt])
        # only care about final timestep of integration returned by integrator
        ns = ns[-1]
        ns = ns[:4]  # omit action
        # ODEINT IS TOO SLOW!
        # ns_continuous = integrate.odeint(self._dsdt, self.s_continuous, [0, self.dt])
        # self.s_continuous = ns_continuous[-1]
        # We only care about the state at the ''final timestep'', self.dt

        ns[0] = wrap(ns[0], -pi, pi)
        ns[1] = wrap(ns[1], -pi, pi)
        ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
        ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
        self.state = ns
        terminal = self._terminal()
        reward = -1.0 if not terminal else 0.0
        return (
            {"env_obs": self._get_obs(), "task_obs": self.get_task_obs()},
            reward,
            terminal,
            {},
        )

    def reset(self):
        self._mu_to_vars(self.task_state)
        self.state = self.np_random_env.uniform(low=-0.1, high=0.1, size=(4,))
        self.t = 0
        return {"env_obs": self._get_obs(), "task_obs": self.get_task_obs()}

    def get_task_obs(self):
        return self.task_state

    def get_task_state(self):
        return self.task_state

    def set_task_state(self, task_state):
        self.task_state = task_state

    def _get_obs(self):
        s = self.state
        return [cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]]

    def _terminal(self):
        s = self.state
        return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.0)

    def _dsdt(self, s_augmented, t):
        m1 = self.LINK_MASS_1
        m2 = self.LINK_MASS_2
        l1 = self.LINK_LENGTH_1
        lc1 = self.LINK_COM_POS_1
        lc2 = self.LINK_COM_POS_2
        I1 = self.LINK_MOI
        I2 = self.LINK_MOI
        g = 9.8
        a = s_augmented[-1]
        s = s_augmented[:-1]
        theta1 = s[0]
        theta2 = s[1]
        dtheta1 = s[2]
        dtheta2 = s[3]
        d1 = (
            m1 * lc1 ** 2
            + m2 * (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * cos(theta2))
            + I1
            + I2
        )
        d2 = m2 * (lc2 ** 2 + l1 * lc2 * cos(theta2)) + I2
        phi2 = m2 * lc2 * g * cos(theta1 + theta2 - pi / 2.0)
        phi1 = (
            -m2 * l1 * lc2 * dtheta2 ** 2 * sin(theta2)
            - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
            + (m1 * lc1 + m2 * l1) * g * cos(theta1 - pi / 2)
            + phi2
        )
        if self.book_or_nips == "nips":
            # the following line is consistent with the description in the
            # paper
            ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (
                m2 * lc2 ** 2 + I2 - d2 ** 2 / d1
            )
        else:
            # the following line is consistent with the java implementation
            # and the book
            ddtheta2 = (
                a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1 ** 2 * sin(theta2) - phi2
            ) / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1)
        ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
        return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0)

    def seed(self, env_seed):
        self.np_random_env, seed = seeding.np_random(env_seed)
        return [seed]

    def seed_task(self, task_seed):
        self.np_random_task, seed = seeding.np_random(task_seed)
        return [seed]

    def sample_task_state(self):
        self.assert_task_seed_is_set()
        super().sample_task_state()
        # seven task-descriptor components, each drawn uniformly from [-1, 1]
        new_task_state = [self.np_random_task.uniform(-1, 1) for _ in range(7)]
        return new_task_state
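
# A minimal usage sketch (illustrative, using only the methods defined
# above): a task can be captured with `get_task_state` and replayed later
# with `set_task_state`, which makes evaluation on a fixed task set easy.
#
#     env = MTAcrobot()
#     env.seed(0)
#     env.seed_task(1)
#     env.reset_task_state()
#     saved = env.get_task_state()  # 7-dim descriptor in [-1, 1]^7
#     ...
#     env.set_task_state(saved)     # revisit the same acrobot variant
#     obs = env.reset()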

def wrap(x, m, M):
    """
    :param x: a scalar
    :param m: minimum possible value in range
    :param M: maximum possible value in range

    Wraps ``x`` so that m <= x <= M; but unlike ``bound()``, which truncates,
    ``wrap()`` wraps ``x`` around the coordinate system defined by m, M.

    For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
    """
    diff = M - m
    while x > M:
        x = x - diff
    while x < m:
        x = x + diff
    return x
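
# Example usage (sanity check, not from the original source):
# wrap(360, -180, 180) returns 0, and wrap(3.5 * pi, -pi, pi)
# returns -0.5 * pi.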

def bound(x, m, M=None):
    """
    :param x: scalar

    Either have m as a scalar, so bound(x, m, M) returns m <= x <= M, *OR*
    have m as a length-2 vector, in which case bound(x, m, <IGNORED>)
    returns m[0] <= x <= m[1].
    """
    if M is None:
        M = m[1]
        m = m[0]
    # bound x between min (m) and Max (M)
    return min(max(x, m), M)
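
# Example usage (sanity check, not from the original source):
# bound(5.0, -1.0, 1.0) returns 1.0, and the vector form
# bound(5.0, (-1.0, 1.0)) returns the same.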

def rk4(derivs, y0, t, *args, **kwargs):
    """
    Integrate a 1D or ND system of ODEs using 4th-order Runge-Kutta.

    This is a toy implementation which may be useful if you find yourself
    stranded on a system w/o scipy. Otherwise use :func:`scipy.integrate`.

    *y0*
        initial state vector

    *t*
        sample times

    *derivs*
        returns the derivative of the system and has the
        signature ``dy = derivs(yi, ti)``

    *args*
        additional arguments passed to the derivative function

    *kwargs*
        additional keyword arguments passed to the derivative function

    Example 1::

        ## 2D system

        def derivs6(x, t):
            d1 = x[0] + 2 * x[1]
            d2 = -3 * x[0] + 4 * x[1]
            return (d1, d2)

        dt = 0.0005
        t = arange(0.0, 2.0, dt)
        y0 = (1, 2)
        yout = rk4(derivs6, y0, t)

    Example 2::

        ## 1D system
        alpha = 2
        def derivs(x, t):
            return -alpha * x + exp(-t)

        y0 = 1
        yout = rk4(derivs, y0, t)

    If you have access to scipy, you should probably be using the
    scipy.integrate tools rather than this function.
    """

    try:
        Ny = len(y0)
    except TypeError:
        yout = np.zeros((len(t),), np.float64)
    else:
        yout = np.zeros((len(t), Ny), np.float64)

    yout[0] = y0

    for i in np.arange(len(t) - 1):
        thist = t[i]
        dt = t[i + 1] - thist
        dt2 = dt / 2.0
        y0 = yout[i]

        k1 = np.asarray(derivs(y0, thist, *args, **kwargs))
        k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs))
        k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs))
        k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs))
        yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
    return yout
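
# Quick sanity check (illustrative, not from the original source):
# integrating dy/dt = y from y0 = 1 over [0, 1] should approach
# e ~= 2.71828:
#
#     t = np.linspace(0.0, 1.0, 101)
#     yout = rk4(lambda y, ti: y, 1.0, t)
#     print(yout[-1])  # ~2.71828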

class Acrobot(MTAcrobot):
    """The original acrobot environment, in the MTEnv fashion."""

    def __init__(self):
        super().__init__()

    def sample_task_state(self):
        self.assert_task_seed_is_set()
        super().sample_task_state()
        # the canonical acrobot: all seven task-descriptor components fixed at 0.0
        new_task_state = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        return new_task_state

if __name__ == "__main__":
    env = MTAcrobot()
    env.seed(5)
    env.seed_task(15)
    env.reset_task_state()
    obs = env.reset()
    print(obs)
    done = False
    while not done:
        obs, rew, done, _ = env.step(np.random.randint(env.action_space.n))
        print(obs)