# Source code for interpreter.policies

import gymnasium as gym
from abc import ABC, abstractmethod
import numpy as np
from sklearn.base import RegressorMixin, ClassifierMixin
from stable_baselines3.common.utils import is_vectorized_box_observation

class Policy(ABC):
    """
    Abstract base class for a policy.

    Parameters
    ----------
    observation_space : gym.Space
        The observation space of the environment.
    action_space : gym.Space
        The action space of the environment.

    Attributes
    ----------
    observation_space : gym.Space
        The observation space of the environment.
    action_space : gym.Space
        The action space of the environment.
    """

    def __init__(self, observation_space, action_space):
        self.observation_space = observation_space
        self.action_space = action_space

    @abstractmethod
    def predict(self, obs, state=None, deterministic=True, episode_start=0):
        """
        Predict the action to take given an observation.

        Parameters
        ----------
        obs : np.ndarray
            The observation input.
        state : object, optional
            The state of the policy (default is None).
        deterministic : bool, optional
            Whether to use a deterministic policy (default is True).
        episode_start : int, optional
            The episode start index (default is 0).

        Returns
        -------
        action : np.ndarray
            The action to take.
        state : object
            The updated state of the policy.

        Raises
        ------
        NotImplementedError
            Always in this base class; concrete policies must override it.
        """
        raise NotImplementedError

class SB3Policy(Policy):
    """
    Policy wrapper that delegates to an existing policy object.

    Parameters
    ----------
    base_policy : object
        The wrapped policy. It must expose ``observation_space`` and
        ``action_space`` attributes and a ``predict`` method taking
        ``(obs, state, deterministic, episode_start)`` positionally
        (presumably a Stable-Baselines3 model or policy).

    Attributes
    ----------
    base_policy : object
        The wrapped policy.
    observation_space : gym.Space
        The observation space taken from ``base_policy``.
    action_space : gym.Space
        The action space taken from ``base_policy``.
    """

    def __init__(self, base_policy):
        self.base_policy = base_policy
        # Reuse the wrapped policy's spaces so this wrapper exposes the same
        # attributes as every other Policy.
        super().__init__(
            self.base_policy.observation_space, self.base_policy.action_space
        )

    def predict(self, obs, state=None, deterministic=True, episode_start=0):
        """
        Predict the action to take by forwarding the call to ``base_policy``.

        Parameters
        ----------
        obs : np.ndarray
            The observation input.
        state : object, optional
            The state of the policy (default is None).
        deterministic : bool, optional
            Whether to use a deterministic policy (default is True).
        episode_start : int, optional
            The episode start index (default is 0).

        Returns
        -------
        object
            Whatever ``base_policy.predict`` returns, unchanged.
        """
        return self.base_policy.predict(obs, state, deterministic, episode_start)

class DTPolicy(Policy):
    """
    Decision Tree Policy class.

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        The decision tree classifier or regressor.
    env : gym.Env
        The environment in which the policy operates.

    Attributes
    ----------
    clf : sklearn.base.BaseEstimator
        The decision tree classifier or regressor.
    observation_space : gym.Space
        The observation space of the environment.
    action_space : gym.Space
        The action space of the environment.
    """

    def __init__(self, clf, env):
        # The estimator kind must match the action space: a regressor for
        # continuous (Box) actions, a classifier for Discrete actions.
        assert isinstance(env.observation_space, gym.spaces.Box)
        if isinstance(env.action_space, gym.spaces.Box):
            assert isinstance(clf, RegressorMixin)
        elif isinstance(env.action_space, gym.spaces.Discrete):
            assert isinstance(clf, ClassifierMixin)
        super().__init__(env.observation_space, env.action_space)
        self.clf = clf
        # Policy initialization with random samples: fit once on random
        # (observation, action) pairs so that `predict` can be called before
        # any real data has been provided through `fit`.
        self.clf.fit(
            [self.observation_space.sample() for _ in range(1000)],
            [self.action_space.sample() for _ in range(1000)],
        )

    def predict(self, obs, state=None, deterministic=True, episode_start=0):
        """
        Predict the action to take given an observation.

        Parameters
        ----------
        obs : np.ndarray
            The observation input, either a single observation or a batch.
        state : object, optional
            The state of the policy (default is None). Returned untouched for
            a single observation; not used otherwise.
        deterministic : bool, optional
            Whether to use a deterministic policy (default is True). Not used
            by this policy.
        episode_start : int, optional
            The episode start index (default is 0). Not used by this policy.

        Returns
        -------
        action : np.ndarray
            The action to take.
        state : object
            ``state`` as passed in for a single observation, ``None`` for a
            batch of observations.
        """
        is_discrete = isinstance(self.action_space, gym.spaces.Discrete)

        if not is_vectorized_box_observation(obs, self.observation_space):
            # Single observation: the estimator expects a 2-D (1, n) input.
            action = self.clf.predict(obs.reshape(1, -1))
            if is_discrete:
                action = action.squeeze().astype(int)
            elif self.action_space.shape[0] > 1:
                action = action.squeeze()
            # One-dimensional continuous actions keep the raw prediction.
            return action, state

        # Batch of observations.
        actions = self.clf.predict(obs)
        if is_discrete:
            return actions.astype(int), None
        if self.action_space.shape[0] > 1:
            return actions, None
        # One-dimensional continuous actions: add a trailing action axis.
        return actions[:, np.newaxis], None

    def fit(self, S, A):
        """
        Fit the decision tree with the provided observations and actions.

        Parameters
        ----------
        S : np.ndarray
            The observations.
        A : np.ndarray
            The actions.
        """
        self.clf.fit(S, A)

class ObliqueDTPolicy(Policy):
    """
    Oblique Decision Tree Policy class.

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        The decision tree classifier or regressor.
    env : gym.Env
        The environment in which the policy operates.

    Attributes
    ----------
    clf : sklearn.base.BaseEstimator
        The decision tree classifier or regressor.
    observation_space : gym.Space
        The observation space of the environment.
    action_space : gym.Space
        The action space of the environment.
    """

    def __init__(self, clf, env):
        # The estimator kind must match the action space: a regressor for
        # continuous (Box) actions, a classifier for Discrete actions.
        if isinstance(env.action_space, gym.spaces.Box):
            assert isinstance(clf, RegressorMixin)
        elif isinstance(env.action_space, gym.spaces.Discrete):
            assert isinstance(clf, ClassifierMixin)
        super().__init__(env.observation_space, env.action_space)
        self.clf = clf
        # Policy initialization with clipped random samples: fit once so that
        # `predict` can be called before any real data has been provided.
        init_S = np.array(
            [self.observation_space.sample() for _ in range(1000)]
        ).clip(-2, 2)
        self.clf.fit(
            self.get_oblique_data(init_S),
            [self.action_space.sample() for _ in range(1000)],
        )

    def get_oblique_data(self, S):
        """
        Generate oblique data by creating pairwise differences between
        observations.

        Parameters
        ----------
        S : np.ndarray
            The input observations, one observation per row.

        Returns
        -------
        final : np.ndarray
            The original observations stacked with pairwise differences.
        """
        S = np.asarray(S)
        # Index pairs (i, j) with i > j: the strict lower triangle.
        rows, cols = np.tril_indices(self.observation_space.shape[0], k=-1)
        # For each pair the feature is S[:, j] - S[:, i]. Indexing the columns
        # directly gives the same values, in the same order, as tiling the
        # observations into a square matrix and reading its lower triangle,
        # without building that intermediate.
        diffs = S[:, cols] - S[:, rows]
        # Stack the original rows with the differences.
        return np.hstack((S, diffs))

    def predict(self, obs, state=None, deterministic=True, episode_start=0):
        """
        Predict the action to take given an observation.

        Parameters
        ----------
        obs : np.ndarray
            The observation input, either a single observation or a batch.
        state : object, optional
            The state of the policy (default is None). Returned untouched for
            a single observation; not used otherwise.
        deterministic : bool, optional
            Whether to use a deterministic policy (default is True). Not used
            by this policy.
        episode_start : int, optional
            The episode start index (default is 0). Not used by this policy.

        Returns
        -------
        action : np.ndarray
            The action to take.
        state : object
            ``state`` as passed in for a single observation, ``None`` for a
            batch of observations.
        """
        is_discrete = isinstance(self.action_space, gym.spaces.Discrete)

        if not is_vectorized_box_observation(obs, self.observation_space):
            # Single observation: treat it as a batch of one so the oblique
            # features come from the same helper as the batch path.
            features = self.get_oblique_data(obs.reshape(1, -1))
            action = self.clf.predict(features)
            if is_discrete:
                action = action.squeeze().astype(int)
            elif self.action_space.shape[0] > 1:
                action = action.squeeze()
            # One-dimensional continuous actions keep the raw prediction.
            return action, state

        # Batch of observations.
        actions = self.clf.predict(self.get_oblique_data(obs))
        if is_discrete:
            return actions.astype(int), None
        if self.action_space.shape[0] > 1:
            return actions, None
        # One-dimensional continuous actions: add a trailing action axis.
        return actions[:, np.newaxis], None

    def fit(self, S, A):
        """
        Fit the decision tree with the provided oblique observations and
        actions.

        Parameters
        ----------
        S : np.ndarray
            The observations.
        A : np.ndarray
            The actions.
        """
        self.clf.fit(self.get_oblique_data(S), A)