Source code for mtenv.envs.control.cartpole

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import math

import numpy as np
from gym import logger, spaces

from mtenv import MTEnv
from mtenv.utils import seeding

"""
Classic cart-pole system implemented based on Rich Sutton et al.
Copied from http://incompleteideas.net/sutton/book/code/pole.c
permalink: https://perma.cc/C9ZM-652R
"""


class MTCartPole(MTEnv):
    """A cartpole environment with varying physical values (see the
    `_mu_to_vars` function)."""

    metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 50}

    def _mu_to_vars(self, mu):
        self.gravity = 9.8 + mu[0] * 5
        self.masscart = 1.0 + mu[1] * 0.5
        self.masspole = 0.1 + mu[2] * 0.09
        self.total_mass = self.masspole + self.masscart
        self.length = 0.5 + mu[3] * 0.3
        self.polemass_length = self.masspole * self.length
        self.force_mag = 10 * mu[4]
        if mu[4] == 0:
            self.force_mag = 10

    def __init__(self):
        # Angle at which to fail the episode. The observation bound is set to
        # 2 * theta_threshold_radians so that a failing observation is still
        # within bounds.
        self.x_threshold = 2.4
        self.theta_threshold_radians = 12 * 2 * math.pi / 360

        high = np.array(
            [
                self.x_threshold * 2,
                np.finfo(np.float32).max,
                self.theta_threshold_radians * 2,
                np.finfo(np.float32).max,
            ]
        )
        observation_space = spaces.Box(-high, high, dtype=np.float32)
        action_space = spaces.Discrete(2)
        high = np.array([1.0 for _ in range(5)])
        task_space = spaces.Box(-high, high, dtype=np.float32)
        super().__init__(
            action_space=action_space,
            env_observation_space=observation_space,
            task_observation_space=task_space,
        )

        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = self.masspole + self.masscart
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = self.masspole * self.length
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates
        self.kinematics_integrator = "euler"

        self.state = None
        self.steps_beyond_done = None
        self.task_state = None

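    # The task vector mu lies in [-1, 1]^5; `_mu_to_vars` maps it to the
    # physical constants, so each task is a differently parameterized
    # cartpole: gravity in [4.8, 14.8], cart mass in [0.5, 1.5], pole mass in
    # [0.01, 0.19], pole half-length in [0.2, 0.8], and force magnitude
    # 10 * mu[4] in [-10, 10] (mu[4] == 0 is special-cased to the standard
    # force of 10; a negative mu[4] inverts the effect of the two actions).
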
    def step(self, action):
        self.t += 1
        self._mu_to_vars(self.task_state)
        assert self.action_space.contains(action), "%r (%s) invalid" % (
            action,
            type(action),
        )
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag if action == 1 else -self.force_mag
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (
            force + self.polemass_length * theta_dot * theta_dot * sintheta
        ) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (
            self.length
            * (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass)
        )
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
        if self.kinematics_integrator == "euler":
            x = x + self.tau * x_dot
            x_dot = x_dot + self.tau * xacc
            theta = theta + self.tau * theta_dot
            theta_dot = theta_dot + self.tau * thetaacc
        else:  # semi-implicit euler
            x_dot = x_dot + self.tau * xacc
            x = x + self.tau * x_dot
            theta_dot = theta_dot + self.tau * thetaacc
            theta = theta + self.tau * theta_dot
        self.state = [x, x_dot, theta, theta_dot]
        done = bool(
            x < -self.x_threshold
            or x > self.x_threshold
            or theta < -self.theta_threshold_radians
            or theta > self.theta_threshold_radians
        )

        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            if self.steps_beyond_done == 0:
                logger.warn(
                    "You are calling 'step()' even though this environment "
                    "has already returned done = True. You should always call "
                    "'reset()' once you receive 'done = True' -- any further "
                    "steps are undefined behavior."
                )
            self.steps_beyond_done += 1
            reward = 0.0

        return (
            {"env_obs": self.state, "task_obs": self.get_task_obs()},
            reward,
            done,
            {},
        )

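    # `step` integrates the standard frictionless cart-pole dynamics (Barto,
    # Sutton & Anderson, 1983) with explicit Euler at tau = 0.02 s:
    #
    #   theta_acc = (g * sin(theta) - cos(theta) * temp)
    #               / (l * (4/3 - m_p * cos^2(theta) / (m_c + m_p)))
    #   x_acc     = temp - m_p * l * theta_acc * cos(theta) / (m_c + m_p)
    #
    # where temp = (F + m_p * l * theta_dot^2 * sin(theta)) / (m_c + m_p),
    # F is the applied force, l the pole half-length, m_p the pole mass, and
    # m_c the cart mass.
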
    def reset(self, **args):
        self.assert_env_seed_is_set()
        assert self.task_state is not None
        self._mu_to_vars(self.task_state)
        self.state = self.np_random_env.uniform(low=-0.05, high=0.05, size=(4,))
        self.steps_beyond_done = None
        self.t = 0
        return {"env_obs": self.state, "task_obs": self.get_task_obs()}

    def get_task_obs(self):
        return self.task_state

    def get_task_state(self):
        return self.task_state

    def set_task_state(self, task_state):
        self.task_state = task_state

    def sample_task_state(self):
        self.assert_task_seed_is_set()
        super().sample_task_state()
        # The task distribution is uniform over [-1, 1]^5, i.i.d. per coordinate.
        new_task_state = [
            self.np_random_task.uniform(-1, 1),
            self.np_random_task.uniform(-1, 1),
            self.np_random_task.uniform(-1, 1),
            self.np_random_task.uniform(-1, 1),
            self.np_random_task.uniform(-1, 1),
        ]
        return new_task_state

    def seed(self, env_seed):
        self.np_random_env, seed = seeding.np_random(env_seed)
        return [seed]

    def seed_task(self, task_seed):
        self.np_random_task, seed = seeding.np_random(task_seed)
        return [seed]

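    # Environment randomness and task randomness are seeded independently:
    # `np_random_env` (set via `seed`) drives the initial-state noise in
    # `reset`, while `np_random_task` (set via `seed_task`) drives
    # `sample_task_state`. This makes it possible to resample tasks while
    # keeping episode initialization reproducible, and vice versa.
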
class CartPole(MTCartPole):
    """The original (single-task) cartpole environment, exposed through the
    MTEnv interface."""

    def __init__(self):
        super().__init__()

    def sample_task_state(self):
        new_task_state = [0.0, 0.0, 0.0, 0.0, 0.0]
        return new_task_state

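    # With the all-zero task vector, `_mu_to_vars` yields gravity 9.8, cart
    # mass 1.0, pole mass 0.1, half-length 0.5, and force magnitude 10 (via
    # the mu[4] == 0 special case), i.e. the constants of the classic
    # single-task cartpole.
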
if __name__ == "__main__":
    env = MTCartPole()
    env.seed(5)
    env.seed_task(15)
    env.reset_task_state()
    obs = env.reset()
    print(obs)
    done = False
    while not done:
        obs, rew, done, _ = env.step(np.random.randint(env.action_space.n))
        print(obs)
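
# A minimal sketch of pinning a task by hand instead of sampling it:
# `set_task_state` accepts any vector in [-1, 1]^5, and two environments
# given the same env seed and the same task state start from identical
# states (`fixed_mu` below is an arbitrary, illustrative task vector).
if __name__ == "__main__":
    fixed_mu = [0.5, 0.0, -0.2, 0.1, 0.3]
    initial_obs = []
    for _ in range(2):
        env = MTCartPole()
        env.seed(5)
        env.set_task_state(fixed_mu)
        obs = env.reset()
        initial_obs.append(obs["env_obs"])
    # Identical because both environments share env seed 5 and `fixed_mu`.
    assert np.allclose(initial_obs[0], initial_obs[1])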