Question

I'm working on the following reinforcement learning problem: I have a bottle of fixed capacity (say 5 liters). At the bottom of the bottle there is a cock (valve) to remove water. The distribution of the water removal is not fixed; any amount of water can be removed from the bottle, i.e. any continuous value in [0, 5].

At the top of the bottle a tap is mounted to fill the bottle with water. The RL agent can fill 0, 1, 2, 3 or 4 liters into the bottle. The initial bottle level is any value in [0, 5].

I want to train the agent in this environment to find an optimal sequence of actions such that the bottle neither runs empty nor overflows, which means the water demand is supplied continuously.

Action space = [0, 1, 2, 3, 4], a discrete space

Observation space = [0, capacity of the bottle], i.e. [0, 5], a continuous space

Reward logic = if the bottle runs empty due to an action, give a negative reward; if the bottle overflows due to an action, give a negative reward
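
This reward rule, written out as a small standalone sketch (the helper name reward_and_done and the concrete values -1 and 0.5 are only illustrative, matching what I use later in the environment):

def reward_and_done(level, min_level=0.0, capacity=5.0):
    """Return (reward, done) for a proposed water level."""
    if level <= min_level:      # bottle ran empty
        return -1.0, True
    if level > capacity:        # bottle overflowed
        return -1.0, True
    return 0.5, False           # level stayed inside the bottle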

I have decided to use Python to create the environment.

from gym import spaces
import numpy as np

class WaterEnv():
    def __init__(self, BottleCapacity = 5):
        ## CONSTANTS
        self.MinLevel = 0 # minimum water level
        self.BottleCapacity = BottleCapacity # bottle capacity
        # action space
        self.action_space = spaces.Discrete(self.BottleCapacity)
        # observation space
        self.observation_space = spaces.Box(low=self.MinLevel, high=self.BottleCapacity,
                                            shape=(1,))
        # initial bottle level
        self.initBlevel = self.observation_space.sample()

    def step(self, action):
        # water qty to remove
        WaterRemoveQty = np.random.uniform(self.MinLevel, self.BottleCapacity, 1)

        # updated water level after removal of water
        UpdatedWaterLevel = self.initBlevel - WaterRemoveQty
        # add water - action taken
        UpdatedWaterLevel_ = UpdatedWaterLevel + action

        if UpdatedWaterLevel_ <= self.MinLevel:
            reward = -1
            done = True
        elif UpdatedWaterLevel_ > self.BottleCapacity:
            reward = -1
            done = True
        else:
            reward = 0.5
            done = False

        return UpdatedWaterLevel_, reward, done

    def reset(self):
        """
        Reset the initial bottle value
        """
        self.initBlevel = self.observation_space.sample()

        return self.initBlevel
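
For comparison, here is a sketch of the same environment where the water level is stored between calls to step(); in WaterEnv above, step() always starts from self.initBlevel and never saves the updated level. The class name WaterEnvV2 and the clipping of the demand to the water actually available are my own illustrative additions, not tested code:

import numpy as np
from gym import spaces

class WaterEnvV2:
    """Same idea as WaterEnv, but the current level is kept as state between steps."""

    def __init__(self, BottleCapacity=5):
        self.MinLevel = 0
        self.BottleCapacity = BottleCapacity
        self.action_space = spaces.Discrete(self.BottleCapacity)
        self.observation_space = spaces.Box(low=self.MinLevel, high=self.BottleCapacity,
                                            shape=(1,), dtype=np.float32)
        self.level = self.observation_space.sample()  # current water level, shape (1,)

    def step(self, action):
        # the demand cannot exceed the water currently in the bottle
        demand = min(np.random.uniform(self.MinLevel, self.BottleCapacity),
                     float(self.level[0]))
        new_level = float(self.level[0]) - demand + action

        if new_level <= self.MinLevel or new_level > self.BottleCapacity:
            reward, done = -1, True
        else:
            reward, done = 0.5, False

        # keep the new level so the next step starts from it
        self.level = np.array([np.clip(new_level, self.MinLevel, self.BottleCapacity)],
                              dtype=np.float32)
        return self.level, reward, done

    def reset(self):
        self.level = self.observation_space.sample()
        return self.level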

import random
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000) # memory size
        self.gamma = 0.99    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01 # minimum exploration rate
        self.epsilon_decay = 0.99 # exploration decay
        self.learning_rate = 0.001 # learning rate
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(256, input_dim=self.state_size, activation='relu'))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=SGD(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# create the WaterEnv environment object
env = WaterEnv()

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

minibatch = 32

# Initialize agent
agent = DQNAgent(state_size, action_size)

done = False
lReward = []  # rewards collected per episode over the whole run
rewardAll = 0
XArray = [] # actions taken over the whole run

EPOCHS = 1000

for e in range(EPOCHS):
        #state = np.reshape(state, [1, 1])
        # reset state in the beginning of each epoch
        state = env.reset()
        time_t = 0
        rewardAll = 0

        while True:
            # Decide action
            #state = np.reshape(state, [1, 1])
            action = agent.act(state)

            next_state, reward, done = env.step(action)
            time_t += 1
            #reward = reward if not done else -10

            # Remember the previous state, action, reward, and done
            #next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            # remembering the action for performance check
            XArray.append(action)
            # Assign next_state the new current state for the next frame.
            state = next_state

            if done:
                print("  episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPOCHS, time_t, agent.epsilon))
                break
            rewardAll += reward

            # experience replay
            if len(agent.memory) > minibatch:
                agent.replay(minibatch)

        lReward.append(rewardAll) # append the rewards
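
To check whether the agent improves over the episodes, the per-episode totals in lReward can be plotted, for example with a moving average (a sketch; it requires matplotlib, and the window size of 50 is arbitrary):

import numpy as np
import matplotlib.pyplot as plt

window = 50  # arbitrary smoothing window
smoothed = np.convolve(lReward, np.ones(window) / window, mode='valid')

plt.plot(lReward, alpha=0.3, label='episode reward')
plt.plot(range(window - 1, len(lReward)), smoothed, label='moving average')
plt.xlabel('episode')
plt.ylabel('total reward')
plt.legend()
plt.show()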

After running 1000 epochs, I observed that the agent has not learned anything. I am unable to find out what is going wrong.
