# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the LICENSE file in the root directory of this source tree.
import numpy as np
from gym import spaces
from numpy import cos, pi, sin
from mtenv import MTEnv
from mtenv.utils import seeding
__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy"
__credits__ = [
"Alborz Geramifard",
"Robert H. Klein",
"Christoph Dann",
"William Dabney",
"Jonathan P. How",
]
__license__ = "BSD 3-Clause"
__author__ = "Christoph Dann <cdann@cdann.de>"
# SOURCE:
# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py
[docs]class MTAcrobot(MTEnv):
"""A acrobot environment with varying characteristics
The task descriptor is composed of values between -1 and +1 and mapped to acrobot physical characcteristics in the
self._mu_to_vars function.
"""
metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 15}
dt = 0.2
def _mu_to_vars(self, mu):
self.LINK_LENGTH_1 = 1.0 + mu[0] * 0.5
self.LINK_LENGTH_2 = 1.0 + mu[1] * 0.5
self.LINK_MASS_1 = 1.0 + mu[2] * 0.5
self.LINK_MASS_2 = 1.0 + mu[3] * 0.5
self.LINK_COM_POS_1 = 0.5
self.LINK_COM_POS_2 = 0.5
if mu[6] > 0:
self.AVAIL_TORQUE = [-1.0, 0.0, 1.0]
else:
self.AVAIL_TORQUE = [1.0, 0.0, -1.0]
self.LINK_MOI = 1.0
torque_noise_max = 0.0
MAX_VEL_1 = 4 * pi + pi
MAX_VEL_2 = 9 * pi + 2 * pi
#: use dynamics equations from the nips paper or the book
book_or_nips = "book"
action_arrow = None
domain_fig = None
actions_num = 3
def __init__(self):
self.viewer = None
self.action_space = spaces.Discrete(3)
self.state = None
high = np.array(
[1.5, 1.5, 1.5, 1.5, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32
)
low = -high
observation_space = spaces.Box(low=low, high=high, dtype=np.float32)
action_space = spaces.Discrete(3)
high = np.array([1.0 for k in range(5)])
task_space = spaces.Box(-high, high, dtype=np.float32)
super().__init__(
action_space=action_space,
env_observation_space=observation_space,
task_observation_space=task_space,
)
[docs] def step(self, a):
self.t += 1
self._mu_to_vars(self.task_state)
s = self.state
torque = self.AVAIL_TORQUE[a]
# Add noise to the force action
if self.torque_noise_max > 0:
torque += self.np_random_env.uniform(
-self.torque_noise_max, self.torque_noise_max
)
# Now, augment the state with our force action so it can be passed to
# _dsdt
s_augmented = np.append(s, torque)
ns = rk4(self._dsdt, s_augmented, [0, self.dt])
# only care about final timestep of integration returned by integrator
ns = ns[-1]
ns = ns[:4] # omit action
# ODEINT IS TOO SLOW!
# ns_continuous = integrate.odeint(self._dsdt, self.s_continuous, [0, self.dt])
# self.s_continuous = ns_continuous[-1] # We only care about the state
# at the ''final timestep'', self.dt
ns[0] = wrap(ns[0], -pi, pi)
ns[1] = wrap(ns[1], -pi, pi)
ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
self.state = ns
terminal = self._terminal()
reward = -1.0 if not terminal else 0.0
return (
{"env_obs": self._get_obs(), "task_obs": self.get_task_obs()},
reward,
terminal,
{},
)
[docs] def reset(self):
self._mu_to_vars(self.task_state)
self.state = self.np_random_env.uniform(low=-0.1, high=0.1, size=(4,))
self.t = 0
return {"env_obs": self._get_obs(), "task_obs": self.get_task_obs()}
[docs] def get_task_obs(self):
return self.task_state
[docs] def get_task_state(self):
return self.task_state
[docs] def set_task_state(self, task_state):
self.task_state = task_state
def _get_obs(self):
s = self.state
return [cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]]
def _terminal(self):
s = self.state
return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.0)
def _dsdt(self, s_augmented, t):
m1 = self.LINK_MASS_1
m2 = self.LINK_MASS_2
l1 = self.LINK_LENGTH_1
lc1 = self.LINK_COM_POS_1
lc2 = self.LINK_COM_POS_2
I1 = self.LINK_MOI
I2 = self.LINK_MOI
g = 9.8
a = s_augmented[-1]
s = s_augmented[:-1]
theta1 = s[0]
theta2 = s[1]
dtheta1 = s[2]
dtheta2 = s[3]
d1 = (
m1 * lc1 ** 2
+ m2 * (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * cos(theta2))
+ I1
+ I2
)
d2 = m2 * (lc2 ** 2 + l1 * lc2 * cos(theta2)) + I2
phi2 = m2 * lc2 * g * cos(theta1 + theta2 - pi / 2.0)
phi1 = (
-m2 * l1 * lc2 * dtheta2 ** 2 * sin(theta2)
- 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
+ (m1 * lc1 + m2 * l1) * g * cos(theta1 - pi / 2)
+ phi2
)
if self.book_or_nips == "nips":
# the following line is consistent with the description in the
# paper
ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1)
else:
# the following line is consistent with the java implementation and the
# book
ddtheta2 = (
a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1 ** 2 * sin(theta2) - phi2
) / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1)
ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0)
[docs] def seed(self, env_seed):
self.np_random_env, seed = seeding.np_random(env_seed)
return [seed]
[docs] def seed_task(self, task_seed):
self.np_random_task, seed = seeding.np_random(task_seed)
return [seed]
[docs] def sample_task_state(self):
self.assert_task_seed_is_set()
super().sample_task_state()
new_task_state = [
self.np_random_task.uniform(-1, 1),
self.np_random_task.uniform(-1, 1),
self.np_random_task.uniform(-1, 1),
self.np_random_task.uniform(-1, 1),
self.np_random_task.uniform(-1, 1),
self.np_random_task.uniform(-1, 1),
self.np_random_task.uniform(-1, 1),
]
return new_task_state
[docs]def wrap(x, m, M):
"""
:param x: a scalar
:param m: minimum possible value in range
:param M: maximum possible value in range
Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which
truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n
For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
"""
diff = M - m
while x > M:
x = x - diff
while x < m:
x = x + diff
return x
[docs]def bound(x, m, M=None):
"""
:param x: scalar
Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
"""
if M is None:
M = m[1]
m = m[0]
# bound x between min (m) and Max (M)
return min(max(x, m), M)
[docs]def rk4(derivs, y0, t, *args, **kwargs):
"""
Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta.
This is a toy implementation which may be useful if you find
yourself stranded on a system w/o scipy. Otherwise use
:func:`scipy.integrate`.
*y0*
initial state vector
*t*
sample times
*derivs*
returns the derivative of the system and has the
signature ``dy = derivs(yi, ti)``
*args*
additional arguments passed to the derivative function
*kwargs*
additional keyword arguments passed to the derivative function
Example 1 ::
## 2D system
def derivs6(x,t):
d1 = x[0] + 2*x[1]
d2 = -3*x[0] + 4*x[1]
return (d1, d2)
dt = 0.0005
t = arange(0.0, 2.0, dt)
y0 = (1,2)
yout = rk4(derivs6, y0, t)
Example 2::
## 1D system
alpha = 2
def derivs(x,t):
return -alpha*x + exp(-t)
y0 = 1
yout = rk4(derivs, y0, t)
If you have access to scipy, you should probably be using the
scipy.integrate tools rather than this function.
"""
try:
Ny = len(y0)
except TypeError:
yout = np.zeros((len(t),), np.float_)
else:
yout = np.zeros((len(t), Ny), np.float_)
yout[0] = y0
for i in np.arange(len(t) - 1):
thist = t[i]
dt = t[i + 1] - thist
dt2 = dt / 2.0
y0 = yout[i]
k1 = np.asarray(derivs(y0, thist, *args, **kwargs))
k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs))
k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs))
k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs))
yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
return yout
[docs]class Acrobot(MTAcrobot):
"""The original acrobot environment in the MTEnv fashion"""
def __init__(self):
super().__init__()
[docs] def sample_task_state(self):
self.assert_task_seed_is_set()
super().sample_task_state()
new_task_state = [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
]
return new_task_state
if __name__ == "__main__":
env = MTAcrobot()
env.seed(5)
env.seed_task(15)
env.reset_task_state()
obs = env.reset()
print(obs)
done = False
while not done:
obs, rew, done, _ = env.step(np.random.randint(env.action_space.n))
print(obs)