Source code for dpdt.dpdt_sklearn

import numpy as np
from sklearn.base import (
    BaseEstimator,
    ClassifierMixin,
    _fit_context,
    RegressorMixin,
    MultiOutputMixin,
)
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_is_fitted, check_X_y, check_array
from sklearn.utils._param_validation import Interval
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from .mdp_utils import Action, State
from numbers import Integral


class DPDTreeClassifier(ClassifierMixin, BaseEstimator):
    """
    Dynamic Programming Decision Tree (DPDTree) classifier.

    Parameters
    ----------
    max_depth : int
        The maximum depth of the tree.

    max_nb_trees : int, default=1000
        The maximum number of trees.

    random_state : int, default=42
        Fixes randomness of the classifier. Randomness happens in the calls to CART.

    cart_nodes_list : list of int, default=(3,)
        List containing the number of leaf nodes for the CART trees at each depth.

    Attributes
    ----------
    X_ : ndarray, shape (n_samples, n_features)
        The input passed during :meth:`fit`.
    y_ : ndarray, shape (n_samples,)
        The labels passed during :meth:`fit`.
    classes_ : ndarray, shape (n_classes,)
        The classes seen at :meth:`fit`.
    n_features_in_ : int
        Number of features seen during :term:`fit`.
    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.
    mdp : list of list of State
        The Markov Decision Process represented as a list of lists of states,
        where each inner list contains the states at a specific depth.
    zetas : array-like
        Array of zeta values to be used in the computation.
    trees : dict
        A dictionary representing the tree policies. The keys are tuples
        representing the state observation and depth, and the values are the
        optimal tree for each zeta value.
    init_o : array-like
        The initial observation of the MDP.

    Examples
    --------
    >>> from dpdt import DPDTreeClassifier
    >>> from sklearn import datasets
    >>>
    >>> X, y = datasets.load_breast_cancer(return_X_y=True)
    >>>
    >>> clf = DPDTreeClassifier(max_depth=3, random_state=42)
    >>> clf.fit(X, y)
    >>> print(clf.score(X, y))
    """

    _parameter_constraints = {
        "max_depth": [Interval(Integral, 2, None, closed="left")],
        "max_nb_trees": [Interval(Integral, 1, None, closed="left")],
        "cart_nodes_list": ["array-like"],
        "random_state": [Interval(Integral, 0, None, closed="left")],
    }

    def __init__(
        self, max_depth=3, max_nb_trees=1000, cart_nodes_list=(3,), random_state=42
    ):
        # TODO: potentially pass an instantiated CART directly.
        self.max_depth = max_depth
        self.max_nb_trees = max_nb_trees
        self.cart_nodes_list = cart_nodes_list
        self.random_state = random_state
    def expand_node_(self, node, depth=0):
        """
        Node Expansion:

        - For each node at the current depth, calculates the unique classes and
          their counts for the samples in the node (`node.nz`).
        - Computes the best possible reward (`rstar`) and the action (`astar`)
          leading to the next state.
        - If further expansion is possible (i.e., the depth budget allows and there
          are at least two classes), initializes a `DecisionTreeClassifier` to
          determine the splits.
        - Fits the classifier on the samples in the node, and identifies potential
          splits (features and thresholds).

        Action Creation and Transition:

        - For each split, creates an `Action` and determines the left and right
          child nodes based on the split.
        - Creates the left and right nodes as new states, and adds transitions to
          the action.
        - If an action has valid transitions, adds it to the current node.
        """
        classes, counts = np.unique(self.y_[node.nz], return_counts=True)
        rstar = max(counts) / node.nz.sum() - 1.0
        astar = classes[np.argmax(counts)]
        next_state = State(self.terminal_state_, [0], is_terminal=True)
        a = Action(astar)
        a.transition([rstar] * self.max_nb_trees, 1, next_state)
        node.add_action(a)
        # If there is still depth budget and the current node has more than 1 class:
        if rstar < 0 and depth < self.max_depth:
            # Get the splits from CART.
            # Note that 2 leaf nodes means the split is greedy.
            if depth <= len(self.cart_nodes_list) - 1:
                clf = DecisionTreeClassifier(
                    max_leaf_nodes=max(2, self.cart_nodes_list[depth]),
                    random_state=self.random_state,
                )
            # If the depth budget reaches its limit, get the max entropy split.
            else:
                clf = DecisionTreeClassifier(
                    max_leaf_nodes=2, random_state=self.random_state
                )
            clf.fit(self.X_[node.nz], self.y_[node.nz])
            # Extract the splits from the CART tree.
            masks = clf.tree_.feature >= 0  # get tested features.
            # Apply mask to features and thresholds to get valid indices.
            valid_features = clf.tree_.feature[masks]
            valid_thresholds = clf.tree_.threshold[masks]
            lefts = (
                self.X_[:, valid_features] <= valid_thresholds
            )  # 2D array with one column per CART tree test.
            rights = np.logical_not(
                lefts
            )  # as many rows as data in the whole training set.
            # Masking data passing this threshold and the preceding thresholds.
            lefts *= node.nz[:, np.newaxis]
            rights *= node.nz[:, np.newaxis]
            # Compute probabilities.
            p_left = lefts.sum(axis=0) / node.nz.sum()  # summing column values.
            # In each column (tested feature), non-zero values are data indices
            # passing all tests so far in the MDP trajectory.
            p_right = 1 - p_left
            feat_thresh = list(
                zip(valid_features, valid_thresholds)
            )  # length of the list is the number of tests in the CART tree.
            # Precompute next observations for left and right splits.
            next_obs_left = np.tile(node.obs, (len(feat_thresh), 1))
            next_obs_right = np.tile(node.obs, (len(feat_thresh), 1))
            indices = np.arange(len(feat_thresh))
            # Fast next-obs computation. The next obs in the MDP trajectory get
            # their bounds updated to the threshold values.
            next_obs_left[indices, self.X_.shape[1] + valid_features] = valid_thresholds
            next_obs_right[indices, valid_features] = valid_thresholds
            # Create Action objects for each split.
            actions = [Action(split) for split in feat_thresh]
            # Precompute next states for left and right.
            # There should be a pair of next states per tested feature.
            next_states_left = [
                State(next_obs_left[i], lefts[:, i])
                for i in range(len(valid_features))
            ]
            next_states_right = [
                State(next_obs_right[i], rights[:, i])
                for i in range(len(valid_features))
            ]
            # Perform transitions and append states; the reward is equal to the feature cost.
            for i in range(len(valid_features)):
                actions[i].transition(
                    self._zetas * self._feature_costs[actions[i].action[0]],
                    p_left[i],
                    next_states_left[i],
                )
            for i in range(len(valid_features)):
                actions[i].transition(
                    self._zetas * self._feature_costs[actions[i].action[0]],
                    p_right[i],
                    next_states_right[i],
                )
            [node.add_action(action) for action in actions]
        return node
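
    # Illustrative sketch (comments only, not executed): how `expand_node_` above
    # turns a single CART split into an MDP action, assuming the `Action`/`State`
    # interfaces from `.mdp_utils` used in this file. A split action carries the
    # zeta-weighted feature cost as reward and two stochastic transitions, one per
    # child node:
    #
    #   action = Action((feature, threshold))
    #   action.transition(self._zetas * self._feature_costs[feature], p_left, left_state)
    #   action.transition(self._zetas * self._feature_costs[feature], p_right, right_state)
    #   node.add_action(action)
    #
    # The leaf action built at the top of the method works the same way, with the
    # majority class as the action and a single transition to the terminal state.
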
    def _build_mdp_opt_pol(self, infinite_memory=False):
        """
        Build the Markov Decision Process (MDP) for the trees.

        This method constructs an MDP using a depth-first search approach. Each
        node in the tree represents a state in the MDP, and actions are
        determined based on potential splits from a decision tree classifier.
        This is a depth-first implementation of Algorithm 1 from [1]_.

        References
        ----------
        .. [1] H. Kohler et al., "Interpretable Decision Tree Search as a Markov
           Decision Process", arXiv, https://arxiv.org/abs/2309.12701.
        """
        stack = [(self._root, 0)]
        expanded = [None]
        while stack:
            tmp, d = stack[-1]
            if tmp is expanded[-1]:
                tmp.qs = np.zeros(
                    (len(tmp.actions), self.max_nb_trees), dtype=np.float32
                )
                for a_idx, a in enumerate(tmp.actions):
                    q = np.zeros(self.max_nb_trees, dtype=np.float32)
                    for s, p in zip(a.next_states, a.probas):
                        q += p * s.qs.max(axis=0)
                    tmp.qs[a_idx, :] = np.mean(a.rewards, axis=0) + q
                idx = np.argmax(tmp.qs, axis=0)
                self._trees[tuple(tmp.obs.tolist() + [d])] = [
                    tmp.actions[i].action for i in idx
                ]
                tmp.actions = None
                del expanded[-1]
                del stack[-1]
            elif not tmp.is_terminal:
                tmp = self.expand_node_(tmp, d)
                expanded.append(tmp)
                all_next_states = [
                    j for sub in [a.next_states for a in tmp.actions] for j in sub
                ]
                [stack.append((j, d + 1)) for j in all_next_states]
            else:
                # tmp is a terminal state: do backprop.
                expanded[-1].actions[0].next_states[0].qs = np.zeros(
                    (1, self.max_nb_trees), dtype=np.float32
                )
                self._trees[tuple(tmp.obs.tolist() + [d])] = None
                del stack[-1]
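
    # In effect, the backward pass above evaluates, for every zeta value at once,
    # the Bellman backup
    #
    #   Q_zeta(s, a) = r_zeta(s, a) + sum_{s'} P(s' | s, a) * max_{a'} Q_zeta(s', a')
    #
    # and stores argmax_a Q_zeta(s, a) per zeta as the tree policy for state s.
    # (Sketch of the recursion implemented in this method; the notation is ours,
    # not taken verbatim from the reference.)
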
    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, feature_costs=None, infinite_memory=False):
        """
        Fit the DPDTree classifier.

        - Creates a `root` state with the concatenated minimum and maximum values
          of `self.X_` with slight offsets.
        - Initializes a `terminal_state` as an array of zeros with a length of
          twice the number of features in `self.X_`.
        - Initializes the `root` state with all samples (`nz` as an array of
          `True` values).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples,)
            The target values.

        feature_costs : list of float, optional, default=None
            List containing the feature costs.

        infinite_memory : bool, optional, default=False
            If True, forces garbage collection runs. This allows running CART
            with an unlimited number of leaf nodes. It multiplies the runtime by
            roughly 500.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X, y = self._validate_data(X, y)
        if feature_costs:
            assert (
                len(feature_costs) == X.shape[1]
            ), "There should be as many feature costs as features."
            assert all(
                [fc >= 1 for fc in feature_costs]
            ), "Feature costs must be greater than or equal to 1."
            min_cost, max_cost = min(feature_costs), max(feature_costs)
            if min_cost == max_cost:
                feature_costs = [1 for _ in feature_costs]
            self._feature_costs = feature_costs
        else:
            self._feature_costs = np.ones(X.shape[1])
        # We need to make sure that we have a classification task.
        check_classification_targets(y)

        # The classifier should always store the classes seen during `fit`.
        self.classes_ = np.unique(y)

        # Store the training data to predict later.
        self.X_ = X
        self.y_ = y

        if self.max_nb_trees < 2:
            self._zetas = np.zeros(1)
        else:
            self._zetas = np.linspace(-1, 0, self.max_nb_trees)
        assert len(self._zetas) == self.max_nb_trees

        self._root = State(
            np.concatenate((self.X_.min(axis=0) - 1e-3, self.X_.max(axis=0) + 1e-3)),
            nz=np.ones(self.X_.shape[0], dtype=np.bool_),
        )
        self.terminal_state_ = np.zeros(2 * self.X_.shape[1])
        self._trees = {}
        self._build_mdp_opt_pol(infinite_memory)
        # Return the classifier.
        return self
    def predict(self, X):
        """
        Predict class for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y_pred : array of shape (n_samples,)
            The predicted classes.
        """
        # Check that fit has been called.
        check_is_fitted(self)

        # Input validation.
        # We need to set reset=False because we don't want to overwrite
        # `n_features_in_` and `feature_names_in_` but only check that the shape
        # is consistent.
        X = self._validate_data(X, reset=False)
        return self._predict_zeta(X, -1)
    def _predict_zeta(self, X, zeta_index):
        """
        Predict class for X using a specific zeta index.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        zeta_index : int
            The index of the zeta value to use for prediction.

        Returns
        -------
        y_pred : array of shape (n_samples,)
            The predicted classes.
        """
        init_a = self._trees[tuple(self._root.obs.tolist() + [0])][zeta_index]
        y_pred = np.zeros(len(X), dtype=self.y_.dtype)
        for i, x in enumerate(X):
            a = init_a
            o = self._root.obs.copy()
            H = 0
            while isinstance(a, tuple):  # a not being a tuple implies a leaf node.
                feature, threshold = a
                H += 1
                if x[feature] <= threshold:
                    o[x.shape[0] + feature] = threshold
                else:
                    o[feature] = threshold
                a = self._trees[tuple(o.tolist() + [H])][zeta_index]
            y_pred[i] = a
        return y_pred

    def _average_traj_length_in_mdp(self, X, y, zeta_index):
        """
        Calculate the average trajectory length in the MDP.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,)
            The target values.
        zeta_index : int
            The index of the zeta value to use for calculation.

        Returns
        -------
        accuracy : float
            The prediction accuracy.
        avg_length : float
            The average trajectory length.
        avg_cost : float
            The average feature cost of a trajectory.
        """
        nb_features = X.shape[1]
        init_a = self._trees[tuple(self._root.obs.tolist() + [0])][zeta_index]
        lengths, costs = np.zeros(X.shape[0]), np.zeros(X.shape[0])
        for i, s in enumerate(X):
            a = init_a
            o = self._root.obs.copy()
            H = 0
            cost = 0
            while isinstance(a, tuple):  # a not being a tuple implies a leaf node.
                feature, threshold = a
                H += 1
                cost += self._feature_costs[feature]
                if s[feature] <= threshold:
                    o[nb_features + feature] = threshold
                else:
                    o[feature] = threshold
                a = self._trees[tuple(o.tolist() + [H])][zeta_index]
            lengths[i] = H
            costs[i] = cost
        return (
            sum(
                [
                    self._predict_zeta(X[i].reshape(1, -1), zeta_index)[0] == y[i]
                    for i in range(len(X))
                ]
            )
            / len(X),
            lengths.mean(),
            costs.mean(),
        )
    def get_pareto_front(self, X, y):
        """
        Compute the decision path length / test accuracy Pareto front of DPDTrees.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,)
            The target values.

        Returns
        -------
        scores : array-like of shape (max_nb_trees,)
            The test accuracies of the trees.
        decision_path_length : array-like of shape (max_nb_trees,)
            The average number of decision node traversals in each tree.
        cost : array-like of shape (max_nb_trees,)
            The average feature cost of a decision path in each tree.
        """
        scores = np.zeros(len(self._zetas))
        decision_path_length = np.zeros(len(self._zetas))
        cost = np.zeros(len(self._zetas))
        for z in range(len(self._zetas)):
            (
                scores[z],
                decision_path_length[z],
                cost[z],
            ) = self._average_traj_length_in_mdp(X, y, z)
        return scores, decision_path_length, cost
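
# Usage sketch for `get_pareto_front` (assuming the breast cancer dataset, as in
# the class docstring example above): it evaluates every tree induced by the
# zeta grid and returns, per tree, its accuracy, average decision path length,
# and average feature cost.
#
# >>> from dpdt import DPDTreeClassifier
# >>> from sklearn import datasets
# >>> X, y = datasets.load_breast_cancer(return_X_y=True)
# >>> clf = DPDTreeClassifier(max_depth=3, max_nb_trees=100).fit(X, y)
# >>> scores, lengths, costs = clf.get_pareto_front(X, y)
# >>> scores.shape  # one entry per zeta value
# (100,)
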
class DPDTreeRegressor(RegressorMixin, MultiOutputMixin, BaseEstimator):
    """
    Dynamic Programming Decision Tree (DPDTree) regressor.

    Parameters
    ----------
    max_depth : int
        The maximum depth of the tree.

    max_nb_trees : int, default=1000
        The maximum number of trees.

    random_state : int, default=42
        Fixes randomness of the regressor. Randomness happens in the calls to CART.

    cart_nodes_list : list of int, default=(3,)
        List containing the number of leaf nodes for the CART trees at each depth.

    Attributes
    ----------
    X_ : ndarray, shape (n_samples, n_features)
        The input passed during :meth:`fit`.
    y_ : ndarray, shape (n_samples,) or (n_samples, n_outputs)
        The target values passed during :meth:`fit`.
    n_features_in_ : int
        Number of features seen during :term:`fit`.
    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.
    mdp : list of list of State
        The Markov Decision Process represented as a list of lists of states,
        where each inner list contains the states at a specific depth.
    zetas : array-like
        Array of zeta values to be used in the computation.
    trees : dict
        A dictionary representing the tree policies. The keys are tuples
        representing the state observation and depth, and the values are the
        optimal tree for each zeta value.
    init_o : array-like
        The initial observation of the MDP.
    """

    _parameter_constraints = {
        "max_depth": [Interval(Integral, 2, None, closed="left")],
        "max_nb_trees": [Interval(Integral, 1, None, closed="left")],
        "cart_nodes_list": ["array-like"],
        "random_state": [Interval(Integral, 0, None, closed="left")],
    }

    def __init__(
        self, max_depth=3, max_nb_trees=1000, cart_nodes_list=(3,), random_state=42
    ):
        # TODO: potentially pass an instantiated CART directly.
        self.max_depth = max_depth
        self.max_nb_trees = max_nb_trees
        self.cart_nodes_list = cart_nodes_list
        self.random_state = random_state

    def expand_node_(self, node, depth=0):
        """
        Node Expansion:

        - For each node at the current depth, computes the mean target of the
          samples in the node (`node.nz`).
        - Computes the best possible reward (`rstar`, the negative mean squared
          error of predicting that mean) and the action (`astar`) leading to the
          next state.
        - If further expansion is possible (i.e., the depth budget allows and the
          error is non-zero), initializes a `DecisionTreeRegressor` to determine
          the splits.
        - Fits the regressor on the samples in the node, and identifies potential
          splits (features and thresholds).

        Action Creation and Transition:

        - For each split, creates an `Action` and determines the left and right
          child nodes based on the split.
        - Creates the left and right nodes as new states, and adds transitions to
          the action.
        - If an action has valid transitions, adds it to the current node.
        """
        astar = self.y_[node.nz].mean(axis=0)
        rstar = -1 * mean_squared_error(
            self.y_[node.nz], np.tile(astar, (len(self.y_[node.nz]), 1))
        )
        next_state = State(self.terminal_state_, [0], is_terminal=True)
        a = Action(astar)
        a.transition([rstar] * self.max_nb_trees, 1, next_state)
        node.add_action(a)
        # If there is still depth budget and the current node has non-zero error:
        if rstar < 0 and depth < self.max_depth:
            # Get the splits from CART.
            # Note that 2 leaf nodes means the split is greedy.
            if depth <= len(self.cart_nodes_list) - 1:
                clf = DecisionTreeRegressor(
                    max_leaf_nodes=max(2, self.cart_nodes_list[depth]),
                    random_state=self.random_state,
                )
            # If the depth budget reaches its limit, get the greedy split.
            else:
                clf = DecisionTreeRegressor(
                    max_leaf_nodes=2, random_state=self.random_state
                )
            clf.fit(self.X_[node.nz], self.y_[node.nz])
            # Extract the splits from the CART tree.
            masks = clf.tree_.feature >= 0  # get tested features.
            # Apply mask to features and thresholds to get valid indices.
            valid_features = clf.tree_.feature[masks]
            valid_thresholds = clf.tree_.threshold[masks]
            lefts = (
                self.X_[:, valid_features] <= valid_thresholds
            )  # 2D array with one column per CART tree test.
            rights = np.logical_not(
                lefts
            )  # as many rows as data in the whole training set.
            # Masking data passing this threshold and the preceding thresholds.
            lefts *= node.nz[:, np.newaxis]
            rights *= node.nz[:, np.newaxis]
            # Compute probabilities.
            p_left = lefts.sum(axis=0) / node.nz.sum()  # summing column values.
            # In each column (tested feature), non-zero values are data indices
            # passing all tests so far in the MDP trajectory.
            p_right = 1 - p_left
            feat_thresh = list(
                zip(valid_features, valid_thresholds)
            )  # length of the list is the number of tests in the CART tree.
            # Precompute next observations for left and right splits.
            next_obs_left = np.tile(node.obs, (len(feat_thresh), 1))
            next_obs_right = np.tile(node.obs, (len(feat_thresh), 1))
            indices = np.arange(len(feat_thresh))
            # Fast next-obs computation. The next obs in the MDP trajectory get
            # their bounds updated to the threshold values.
            next_obs_left[indices, self.X_.shape[1] + valid_features] = valid_thresholds
            next_obs_right[indices, valid_features] = valid_thresholds
            # Create Action objects for each split.
            actions = [Action(split) for split in feat_thresh]
            # Precompute next states for left and right.
            # There should be a pair of next states per tested feature.
            next_states_left = [
                State(next_obs_left[i], lefts[:, i])
                for i in range(len(valid_features))
            ]
            next_states_right = [
                State(next_obs_right[i], rights[:, i])
                for i in range(len(valid_features))
            ]
            # Perform transitions and append states; the reward is equal to the feature cost.
            [
                actions[i].transition(
                    self._zetas * self._feature_costs[actions[i].action[0]],
                    p_left[i],
                    next_states_left[i],
                )
                for i in range(len(valid_features))
            ]
            [
                actions[i].transition(
                    self._zetas * self._feature_costs[actions[i].action[0]],
                    p_right[i],
                    next_states_right[i],
                )
                for i in range(len(valid_features))
            ]
            [node.add_action(action) for action in actions]
        return node

    def _build_mdp_opt_pol(self, infinite_memory=False):
        """
        Build the Markov Decision Process (MDP) for the trees.

        This method constructs an MDP using a depth-first search approach. Each
        node in the tree represents a state in the MDP, and actions are
        determined based on potential splits from a decision tree regressor.
        This is a depth-first implementation of Algorithm 1 from [1]_.

        References
        ----------
        .. [1] H. Kohler et al., "Interpretable Decision Tree Search as a Markov
           Decision Process", arXiv, https://arxiv.org/abs/2309.12701.
        """
        stack = [(self._root, 0)]
        expanded = [None]
        while stack:
            tmp, d = stack[-1]
            if tmp is expanded[-1]:
                tmp.qs = np.zeros(
                    (len(tmp.actions), self.max_nb_trees), dtype=np.float32
                )
                for a_idx, a in enumerate(tmp.actions):
                    q = np.zeros(self.max_nb_trees, dtype=np.float32)
                    for s, p in zip(a.next_states, a.probas):  # length 2 or 1.
                        q += p * s.qs.max(axis=0)
                    tmp.qs[a_idx, :] = np.mean(a.rewards, axis=0) + q
                idx = np.argmax(tmp.qs, axis=0)
                self._trees[tuple(tmp.obs.tolist() + [d])] = [
                    tmp.actions[i].action for i in idx
                ]
                tmp.actions = None
                del expanded[-1]
                del stack[-1]
            elif not tmp.is_terminal:
                tmp = self.expand_node_(tmp, d)
                expanded.append(tmp)
                all_next_states = [
                    j for sub in [a.next_states for a in tmp.actions] for j in sub
                ]
                [stack.append((j, d + 1)) for j in all_next_states]
            else:
                # tmp is a terminal state: do backprop.
                expanded[-1].actions[0].next_states[0].qs = np.zeros(
                    (1, self.max_nb_trees), dtype=np.float32
                )
                self._trees[tuple(tmp.obs.tolist() + [d])] = None
                del stack[-1]

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, feature_costs=None, infinite_memory=False):
        """
        Fit the DPDTree regressor.

        - Creates a `root` state with the concatenated minimum and maximum values
          of `self.X_` with slight offsets.
        - Initializes a `terminal_state` as an array of zeros with a length of
          twice the number of features in `self.X_`.
        - Initializes the `root` state with all samples (`nz` as an array of
          `True` values).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values.

        feature_costs : list of float, optional, default=None
            List containing the feature costs.

        infinite_memory : bool, optional, default=False
            If True, forces garbage collection runs. This allows running CART
            with an unlimited number of leaf nodes. It multiplies the runtime by
            roughly 500.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X, y = check_X_y(X, y, y_numeric=True, multi_output=True)
        self._check_n_features(X, reset=True)
        if feature_costs:
            assert (
                len(feature_costs) == X.shape[1]
            ), "There should be as many feature costs as features."
            assert all(
                [fc >= 1 for fc in feature_costs]
            ), "Feature costs must be greater than or equal to 1."
            min_cost, max_cost = min(feature_costs), max(feature_costs)
            if min_cost == max_cost:
                feature_costs = [1 for _ in feature_costs]
            self._feature_costs = feature_costs
        else:
            self._feature_costs = np.ones(X.shape[1])

        # Store the training data to predict later.
        self.X_ = X
        self.y_ = y.astype(float)

        if self.max_nb_trees < 2:
            self._zetas = np.zeros(1)
        else:
            self._zetas = np.linspace(-1, 0, self.max_nb_trees)
        assert len(self._zetas) == self.max_nb_trees

        self._root = State(
            np.concatenate((self.X_.min(axis=0) - 1e-3, self.X_.max(axis=0) + 1e-3)),
            nz=np.ones(self.X_.shape[0], dtype=np.bool_),
        )
        self.terminal_state_ = np.zeros(2 * self.X_.shape[1])
        self._trees = dict()
        self._build_mdp_opt_pol(infinite_memory)
        # Return the regressor.
        return self

    def predict(self, X):
        """
        Predict target values for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y_pred : array of shape (n_samples,) or (n_samples, n_outputs)
            The predicted values.
        """
        # Check that fit has been called.
        check_is_fitted(self)

        # Input validation.
        X = check_array(X)
        return self._predict_zeta(X, -1)

    def _predict_zeta(self, X, zeta_index):
        """
        Predict target values for X using a specific zeta index.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        zeta_index : int
            The index of the zeta value to use for prediction.

        Returns
        -------
        y_pred : array of shape (n_samples,) or (n_samples, n_outputs)
            The predicted values.
        """
        init_a = self._trees[tuple(self._root.obs.tolist() + [0])][zeta_index]
        if self.y_.ndim > 1:
            y_pred = np.zeros((len(X), self.y_.shape[1]), dtype=self.y_.dtype)
        else:
            y_pred = np.zeros((len(X)), dtype=self.y_.dtype)
        for i, x in enumerate(X):
            a = init_a
            o = self._root.obs.copy()
            H = 0
            while isinstance(a, tuple):  # a not being a tuple implies a leaf node.
                feature, threshold = a
                H += 1
                if x[feature] <= threshold:
                    o[x.shape[0] + feature] = threshold
                else:
                    o[feature] = threshold
                a = self._trees[tuple(o.tolist() + [H])][zeta_index]
            y_pred[i] = a
        return y_pred

    def _average_traj_length_in_mdp(self, X, y, zeta_index):
        """
        Calculate the average trajectory length in the MDP.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,)
            The target values.
        zeta_index : int
            The index of the zeta value to use for calculation.

        Returns
        -------
        score : float
            The R^2 score of the predictions.
        avg_length : float
            The average trajectory length.
        avg_cost : float
            The average feature cost of a trajectory.
        """
        nb_features = X.shape[1]
        init_a = self._trees[tuple(self._root.obs.tolist() + [0])][zeta_index]
        lengths, costs = np.zeros(X.shape[0]), np.zeros(X.shape[0])
        for i, s in enumerate(X):
            a = init_a
            o = self._root.obs.copy()
            H = 0
            cost = 0
            while isinstance(a, tuple):  # a not being a tuple implies a leaf node.
                feature, threshold = a
                H += 1
                cost += self._feature_costs[feature]
                if s[feature] <= threshold:
                    o[nb_features + feature] = threshold
                else:
                    o[feature] = threshold
                a = self._trees[tuple(o.tolist() + [H])][zeta_index]
            lengths[i] = H
            costs[i] = cost
        return (
            r2_score(
                y,
                [
                    self._predict_zeta(X[i].reshape(1, -1), zeta_index)[0]
                    for i in range(len(X))
                ],
            ),
            lengths.mean(),
            costs.mean(),
        )

    def get_pareto_front(self, X, y):
        """
        Compute the decision path length / test score Pareto front of DPDTrees.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,)
            The target values.

        Returns
        -------
        scores : array-like of shape (max_nb_trees,)
            The test R^2 scores of the trees.
        decision_path_length : array-like of shape (max_nb_trees,)
            The average number of decision node traversals in each tree.
        cost : array-like of shape (max_nb_trees,)
            The average feature cost of a decision path in each tree.
        """
        scores = np.zeros(len(self._zetas))
        decision_path_length = np.zeros(len(self._zetas))
        cost = np.zeros(len(self._zetas))
        for z in range(len(self._zetas)):
            (
                scores[z],
                decision_path_length[z],
                cost[z],
            ) = self._average_traj_length_in_mdp(X, y, z)
        return scores, decision_path_length, cost
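
# Minimal usage sketch for the regressor, which has no Examples section of its
# own; it mirrors the classifier example and assumes scikit-learn's diabetes
# dataset is available. Kept under a __main__ guard so importing the module
# stays side-effect free.
if __name__ == "__main__":
    from sklearn import datasets

    X, y = datasets.load_diabetes(return_X_y=True)
    reg = DPDTreeRegressor(max_depth=3, max_nb_trees=50, cart_nodes_list=(8, 4))
    reg.fit(X, y)
    print(reg.score(X, y))  # R^2 on the training data.
    # Pareto front of R^2 vs. average decision path length across zeta values.
    r2s, path_lengths, feature_costs = reg.get_pareto_front(X, y)
    print(r2s.max(), path_lengths[r2s.argmax()])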