# Source code for rl_coach.exploration_policies.bootstrapped

#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import List

import numpy as np

from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.e_greedy import EGreedy, EGreedyParameters
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
from rl_coach.schedules import Schedule, LinearSchedule
from rl_coach.spaces import ActionSpace

class BootstrappedParameters(EGreedyParameters):
    """
    Parameters for the Bootstrapped exploration policy.

    Extends the e-greedy parameters with the probability used to decide which
    network heads each transition is shared with.
    """
    def __init__(self):
        super().__init__()
        # Probability that a transition is assigned to any given head's training mask.
        # NOTE(review): 1.0 means every head sees every transition — confirm against
        # the Bootstrapped DQN agent's replay-mask usage.
        self.bootstrapped_data_sharing_probability = 1.0
        # Anneal epsilon linearly from 1 down to 0.01 over 1,000,000 steps.
        self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)

    @property
    def path(self):
        # Import path used by the framework to dynamically instantiate the policy.
        return 'rl_coach.exploration_policies.bootstrapped:Bootstrapped'

class Bootstrapped(EGreedy):
    """
    Bootstrapped exploration policy is currently only used for discrete action spaces along with the
    Bootstrapped DQN agent. It assumes that there is an ensemble of network heads, where each one predicts the
    values for all the possible actions. For each episode, a single head is selected to lead the agent, according
    to its value predictions. In evaluation, the action is selected using a majority vote over all the heads
    predictions.

    .. note::
        This exploration policy will only work for Discrete action spaces with Bootstrapped DQN style agents,
        since it requires the agent to have a network with multiple heads.
    """
    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
                 continuous_exploration_policy_parameters: ExplorationParameters = None):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy
        """
        # The parameter is documented above and forwarded to EGreedy below; default it to
        # AdditiveNoiseParameters (EGreedy's own default) via a None sentinel so a fresh
        # instance is created per call rather than sharing one mutable default object.
        if continuous_exploration_policy_parameters is None:
            continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        super().__init__(action_space, epsilon_schedule, evaluation_epsilon, continuous_exploration_policy_parameters)
        # Holds the selected head's value predictions from the most recent action selection.
        self.last_action_values = 0

def get_action(self, action_values: List[ActionType]) -> ActionType:
# action values are none in case the exploration policy is going to select a random action
if action_values is not None:
if self.phase == RunPhase.TRAIN: