Question

I wanted to build a DQN, so I followed this code and watched a few videos on the idea behind DQNs. My code is below (mine is written in tflearn, while theirs uses Keras):
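As I understand it, the core update a DQN performs is the Q-learning target, which the batch loop in Q_Learning below is meant to compute:

y[action] = reward + discount * max(Q(new_observation))

i.e. the target for the action actually taken is the immediate reward plus the discounted best Q-value of the successor state.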

import tflearn as tfl
import numpy as np
import gym
from collections import deque
import random

class DeepQ():
    def __init__(self, game="SpaceInvaders-v0"):
        self.game = game
        self.env = gym.make(game)
        self.storage = deque()
        self.filter_size = [4, 4]
        self.itertime = 1000
        self.random_move_prop = 0.8
        np.random.seed(1)
        self.minibatch_size = 250
        self.discounted_future_reward = 0.9

    def Q_Network(self, learning_rate=0.0000001, load=False, model_path=None, checkpoint_path="X://xxx//xxx//Documents//GitHub//Deeplearning_for_starters//Atari_modells//checkpoint.ckpt"):
        # The network definition is identical for training and loading,
        # so it is built once instead of being duplicated in both branches.
        net = tfl.layers.core.input_data(shape=[None, 210, 160, 3])  # rework this stuff
        net = tfl.layers.conv.conv_2d(net, nb_filter=3, filter_size=self.filter_size, activation="relu")
        net = tfl.layers.conv.conv_2d(net, nb_filter=3, filter_size=self.filter_size, activation="relu")
        #net = tfl.layers.fully_connected(net, 20, activation="relu")
        net = tfl.layers.flatten(net)
        #net = tfl.layers.fully_connected(net, 18, activation="relu")
        net = tfl.layers.fully_connected(net, 10, activation="relu")
        net = tfl.layers.fully_connected(net, self.env.action_space.n, activation="linear")
        net = tfl.layers.estimator.regression(net, learning_rate=learning_rate)
        if load == False:
            self.modell = tfl.DNN(net, checkpoint_path=checkpoint_path)
        else:
            self.modell = tfl.DNN(net)
            self.modell.load(model_path, weights_only=True)
    def Q_Learning(self):
        observation = self.env.reset()
        for i in range(self.itertime):
            #self.env.render()
            observation = observation.reshape(1, 210, 160, 3)
            if np.random.rand() <= self.random_move_prop:
                #print("Random step")
                action = np.random.randint(low=0, high=self.env.action_space.n)
            else:
                #print("Prediction")  # useful for debugging
                action = self.modell.predict(observation)
                action = np.argmax(action)
            new_observation, reward, done, info = self.env.step(action)
            self.storage.append((observation, action, reward, new_observation, done))
            observation = new_observation
            if done:
                observation = self.env.reset()  # reassign, otherwise the stale game-over frame keeps being used
        print("###############################################")
        print("Done with observing!")
        print("###############################################")
        minibatch = random.sample(self.storage, self.minibatch_size)  # take random transitions from the replay store
        x = np.zeros((self.minibatch_size, 210, 160, 3))  # matches the network input, without an extra batch axis
        y = np.zeros((self.minibatch_size, self.env.action_space.n))
        for i in range(self.minibatch_size):
            Observation, Action, Reward, New_observation, Done = minibatch[i]
            print("Processing batch data... (step: " + str(i) + " of " + str(self.minibatch_size) + ")")
            x[i:i+1] = Observation  # stored observations already have shape (1, 210, 160, 3)
            y[i] = self.modell.predict(Observation)
            # the bootstrap value must come from the successor state, not the current one
            Q_sa = self.modell.predict(New_observation.reshape(1, 210, 160, 3))
            if Done:
                y[i, Action] = Reward
            else:
                y[i, Action] = Reward + self.discounted_future_reward * np.max(Q_sa)
        self.modell.fit_batch(x, y)  # fit once on the assembled batch, not once per transition
        self.modell.save("X://xxx//xxx//xxx//SpaceInvaders1.tfl")
        print("")
        print("Model fitting accomplished!")
        print("")
    def Q_predict(self, model_path="Your path here"):
        self.Q_Network(load=True, model_path=model_path)
        observation = self.env.reset()
        observation = observation.reshape((1,) + observation.shape)
        done = False
        total_reward = 0.0
        while not done:
            self.env.render()
            Q = self.modell.predict(observation)
            print(Q)
            action = np.argmax(Q)
            print(action)
            new_observation, reward, done, info = self.env.step(action)
            observation = new_observation.reshape((1,) + new_observation.shape)
            total_reward += reward
        print("Game ends with a score of: " + str(total_reward))
        print("")

The problem is that when I run the prediction function, the network does nothing. I found out that all the weights are filled with NaN. From what I have read, this can be caused by the learning rate, so I lowered it from 1e-3 to the current value, but that changed nothing.
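From what I have read, unscaled pixel inputs (values from 0 to 255) can also make the activations and gradients blow up to NaN; if that is the cause here, something like this preprocessing sketch (hypothetical, not in the code above) would be the fix to try:

# Hypothetical preprocessing helper, not part of the code above:
# scale raw Atari frames from [0, 255] to [0, 1] before feeding them
# to the network so the loss and gradients stay in a sane range.
def preprocess(observation):
    return observation.astype(np.float32).reshape(1, 210, 160, 3) / 255.0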
