| field | value | date |
|---|---|---|
| author | Florian Jung <flo@windfisch.org> | 2016-01-07 00:10:21 +0100 |
| committer | Florian Jung <flo@windfisch.org> | 2016-01-07 00:10:21 +0100 |
| commit | f9a9a51884aadef97b8952b2807541d31b7e9917 (patch) | |
| tree | fcbe67c97243f91b19067d34cd27237a0bab7f9c | |
| parent | afaa4a050d97197170c3aba7908282010ad83593 (diff) | |
episodic batch learning for QArray. probably broke NN.
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | sol.py | 71 |

1 file changed, 55 insertions, 16 deletions
@@ -213,10 +213,35 @@ class World:
         return self.maze[s[1]][s[0]] == 2
 
+class QCommon:
+    def __init__(self):
+        self.alpha = alpha
+        self.gamma = gamma
+        self.maxdiff = 0.0
+
+    # returns a pair of (diff, self.eval(oldstate)).
+    # diff is meant to be added to self.eval(oldstate)[action]
+    def value_update(self, oldstate, action, newstate, reward):
+        eval_newstate = self.eval(newstate)
+        eval_oldstate = self.eval(oldstate)
+        diff = self.alpha * (reward + self.gamma * max( [ eval_newstate[aa] for aa in directions ] ) - eval_oldstate[action])
+
+        if self.maxdiff < diff: self.maxdiff = diff
+
+        return diff, eval_oldstate
+
+    def episode(self):
+        maxdiff = self.maxdiff
+        self.maxdiff = 0.0
+        return maxdiff
+
+
 # abstracts the Q-array. semantics of .eval(x,y) is `Q[y][x]`. semantics of .change((x,y),ac,diff) is `Q[y][x][ac]+=diff`
-class QArray:
+class QArray(QCommon):
     def __init__(self):
+        super().__init__()
         self.Q = [ [ [0. for k in range(4)] for i in range(a.xlen) ] for j in range(a.ylen) ]
+        self.learnbuffer = []
 
     # calculates Q(x,y)
     def eval(self,x,y = None):
@@ -224,16 +249,24 @@ class QArray:
         return self.Q[y][x]
 
+    def learn(self, oldstate, action, newstate, reward):
+        self.learnbuffer += [(oldstate,action,newstate,reward)]
+        # self.flush_learnbuffer() # TODO TRYME
 
-    def change(self, s, action, diff):
-        self.Q[s[1]][s[0]][action] += diff
+    def flush_learnbuffer(self):
+        for oldstate, action, newstate, reward in reversed(self.learnbuffer):
+            diff,_ = self.value_update(oldstate,action,newstate,reward)
+            self.Q[oldstate[1]][oldstate[0]][action] += diff
+        self.learnbuffer = []
 
     def episode(self):
-        pass
+        self.flush_learnbuffer()
+        return super().episode()
 
 # implements the Q function not through an array, but through a neuronal network instead.
-class QNN:
+class QNN (QCommon):
     def __init__(self):
+        super().__init__()
         connection_rate = 1
         num_input = 2
         hidden = (40,40)
@@ -260,8 +293,16 @@ class QNN:
         self.NN.train(list(s), [x/10. for x in newval])
 
-    def episode(self):
-        pass
+    # learn a transition "from oldstate by action into newstate with reward `reward`"
+    # this does not necessarily mean that the action is instantly trained into the function
+    # representation. It may be held back in a list, to be batch-trained lated.
+    def learn(self, oldstate, action, newstate, reward):
+        diff = self.value_update(oldstate,action,newstate,reward)
+        Q.change(oldstate,action,diff)
+
+    # must be called on every end-of-episode. might trigger batch-training or whatever.
+    #def episode(self):
+    #    pass
 
 a = World(maze, start)
@@ -277,7 +318,6 @@ total_reward = 0.
 
 for i in range(n_episodes):
     s = start
-    maxdiff=0.
 
     for j in range(100):
         # epsilon-greedy
         greedy = argmax(Q.eval(s))
@@ -289,18 +329,17 @@ for i in range(n_episodes):
             action = greedy
 
         r,ss = a.take_action(s[0],s[1], action)
-        #print ((r,ss))
-        diff = alpha * (r + gamma * max( [ Q.eval(ss)[aa] for aa in directions ] ) - Q.eval(s)[action])
-        Q.change(s,action,diff)
-        maxdiff = max(abs(diff),maxdiff)
+
+        Q.learn(s,action,ss,r)
+
         total_reward += r
         s = ss
         if a.is_final(ss):
             break
 
-    Q.episode()
+    maxdiff = Q.episode()
 
     if (i % (frameskip+1) == 0):
-        print("iteration %.3d, alpha=%.3e, epsilon=%.3e maxdiff=%.7f"%(i,alpha,epsilon,maxdiff))
+        print("iteration %.3d, alpha=%.3e, epsilon=%.3e maxdiff=%.7f"%(i,Q.alpha,epsilon,maxdiff))
         n = 0
         if visu:
             n = visualize(maze,Q)
@@ -317,7 +356,7 @@ for i in range(n_episodes):
     # However, if we set the friendlyness of our system to 1.0, then it would
     # also converge without this learning rate reduction, because we have a
     # non-stochastic but a deterministic system now.
-    alpha *= alpha_reduction
+    Q.alpha *= alpha_reduction
 
     epsilon *= epsilon_reduction
 
@@ -329,7 +368,7 @@ for i in range(n_episodes):
     else:
         stopstate = 1000
 
-print("finished after %.3d iterations, alpha=%.3e, epsilon=%.3e"%(i,alpha,epsilon))
+print("finished after %.3d iterations, alpha=%.3e, epsilon=%.3e"%(i,Q.alpha,epsilon))
 visualize(maze,Q)
 if logfile != None:
     logfile.close()
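For context, here is a minimal, self-contained sketch of the episodic batch update this commit adds to QArray: transitions are only recorded by `learn()` during an episode and are replayed in reverse order when `episode()` is called, each replay applying the tabular Q-learning update `Q[s][a] += alpha * (r + gamma * max_a' Q[s'][a'] - Q[s][a])` as in `QCommon.value_update()`. The `BatchQArray` class, the 3x3 grid, the four-action set, and the hand-written two-step episode are illustrative stand-ins, not code from sol.py.

```python
# Illustrative sketch (not from sol.py): buffer transitions during an episode and
# flush them in reverse order at episode end, mirroring QArray.learn()/episode().
# Grid size, actions, learning parameters and the toy episode are made up.

ALPHA, GAMMA = 0.1, 0.9          # stand-ins for the script's alpha/gamma globals
ACTIONS = range(4)               # four moves, analogous to `directions`

class BatchQArray:
    def __init__(self, xlen, ylen):
        # Q[y][x][action], all zeros, as in QArray.__init__
        self.Q = [[[0.0] * len(ACTIONS) for _ in range(xlen)] for _ in range(ylen)]
        self.learnbuffer = []
        self.maxdiff = 0.0

    def eval(self, s):
        x, y = s
        return self.Q[y][x]

    def learn(self, oldstate, action, newstate, reward):
        # record the transition only; the actual update is deferred to episode()
        self.learnbuffer.append((oldstate, action, newstate, reward))

    def episode(self):
        # replay backwards so the final reward propagates through the whole episode
        for oldstate, action, newstate, reward in reversed(self.learnbuffer):
            diff = ALPHA * (reward + GAMMA * max(self.eval(newstate)) - self.eval(oldstate)[action])
            self.maxdiff = max(self.maxdiff, diff)
            self.eval(oldstate)[action] += diff
        self.learnbuffer = []
        maxdiff, self.maxdiff = self.maxdiff, 0.0
        return maxdiff

# toy two-step episode on a 3x3 grid, ending with reward 1.0
Q = BatchQArray(3, 3)
Q.learn((0, 0), 1, (1, 0), 0.0)
Q.learn((1, 0), 1, (2, 0), 1.0)
print("maxdiff after flush:", Q.episode())   # 0.1, from the rewarded final step
print("Q at (0,0):", Q.eval((0, 0)))         # action 1 already credits the future reward
```

Replaying in reverse order is the point of the batch: the reward seen at the end of the episode reaches the earlier states within a single flush, instead of needing one episode per step of backward propagation.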
