-rw-r--r-- | sol.py | 156
1 files changed, 62 insertions, 94 deletions
@@ -48,16 +48,12 @@
 arg=args(argv)
 print(arg)
 mode = None
-if arg['-1'] or arg['--policy-evaluation']:
-    mode = 1
-elif arg['-2'] or arg['--q-learning']:
-    mode = 2
-else:
+if arg['-h']:
     print("Usage: %s MODE [OPTIONS]" % argv[0])
     print("  MODE: -1 / --policy-evaluation or\n" +
          "        -2 / --q-learning\n" +
          "  OPTIONS: --theta NUM      # convergence threshold\n" +
-         "                            # default: %f / %f for -1 / -2\n" % (theta_1, thetha_2) +
+         "                            # default: %f / %f for -1 / -2\n" % (theta_1, theta_2) +
          "           --gamma NUM      # learning discount for value iteration\n" +
          "                            # default: %f\n" % gamma +
          "           --alpha NUM      # learning rate for q-learning\n" +
@@ -73,7 +69,7 @@ else:
          "                            # default: %f\n\n" % epsilon_reduction +
          "           --frameskip NUM  # frameskip for visualisation\n" +
          "                            # default: %f\n" % frameskip +
-         "           --quiet          # disable visualisation" +
+         "           --quiet          # disable visualisation\n" +
          "           --file FILE      # output file for q learning")
     exit()
 
@@ -246,96 +242,68 @@ class World:
     def is_final(self,s):
         return self.maze[s[1]][s[0]] == 2
 
 
-if mode == 1: # policy evaluation
-    theta = theta_1
-    a = World(maze, start)
-
-    V = [ [0.0] * a.xlen for i in range(a.ylen) ]
-    pi = [ [ [0.25] * 4 for i in range(a.xlen) ] for j in range(a.ylen) ]
-
-    i=0
-    while True:
-        i = i + 1
-        delta = 0
-        for x,y in product(range(a.xlen), range(a.ylen)):
-            v = V[y][x]
-            V[y][x] = sum( a.P((x,y),(xx,yy), pi[y][x]) * ( a.R((x,y),(xx,yy), pi[y][x]) + gamma * V[yy][xx] ) for xx,yy in product(range(a.xlen), range(a.ylen)) )
-            delta = max(delta, abs(v - V[y][x]))
+theta = theta_2
+
+a = World(maze, start)
+Q = [ [ [0. for k in range(4)] for i in range(a.xlen) ] for j in range(a.ylen) ]
+
+i=0
+stopstate = -1
+total_reward = 0.
+for i in range(1000000):
+    s = start
+    maxdiff=0.
+    for j in range(100):
+        # epsilon-greedy
+        greedy = argmax(Q[s[1]][s[0]])
+        rnd = random()
+        action = None
+        if rnd < epsilon:
+            action = ( greedy + int(1 + 3 * rnd / epsilon) ) % 4
+        else:
+            action = greedy
+
+        r,ss = a.take_action(s[0],s[1], action)
+        #print ((r,ss))
+        diff = alpha * (r + gamma * max( [ Q[ss[1]][ss[0]][aa] for aa in directions ] ) - Q[s[1]][s[0]][action])
+        Q[s[1]][s[0]][action] += diff
+        maxdiff = max(abs(diff),maxdiff)
+        total_reward += r
+        s = ss
+        if a.is_final(ss):
+            break
 
-        print("iteration %.3d, delta=%.7f"%(i,delta))
+    if (i % (frameskip+1) == 0):
+        print("iteration %.3d, alpha=%.3e, epsilon=%.3e maxdiff=%.7f"%(i,alpha,epsilon,maxdiff))
         n = 0
         if visu:
-            n = visualize(maze,V)
+            n = visualize2(maze,Q)
             cursor_up(n+2)
 
-        if (delta < theta):
-            break
-    print("finished after %d iterations" % i)
-    visualize(maze,V)
-
-elif True: # q-learning
-    theta = theta_2
-
-    a = World(maze, start)
-    Q = [ [ [0. for k in range(4)] for i in range(a.xlen) ] for j in range(a.ylen) ]
-
-    i=0
-    stopstate = -1
-    total_reward = 0.
-    for i in range(1000000):
-        s = start
-        maxdiff=0.
-        for j in range(100):
-            # epsilon-greedy
-            greedy = argmax(Q[s[1]][s[0]])
-            rnd = random()
-            action = None
-            if rnd < epsilon:
-                action = ( greedy + int(1 + 3 * rnd / epsilon) ) % 4
-            else:
-                action = greedy
-
-            r,ss = a.take_action(s[0],s[1], action)
-            #print ((r,ss))
-            diff = alpha * (r + gamma * max( [ Q[ss[1]][ss[0]][aa] for aa in directions ] ) - Q[s[1]][s[0]][action])
-            Q[s[1]][s[0]][action] += diff
-            maxdiff = max(abs(diff),maxdiff)
-            total_reward += r
-            s = ss
-            if a.is_final(ss):
-                break
-
-        if (i % (frameskip+1) == 0):
-            print("iteration %.3d, alpha=%.3e, epsilon=%.3e maxdiff=%.7f"%(i,alpha,epsilon,maxdiff))
-            n = 0
-            if visu:
-                n = visualize2(maze,Q)
-                cursor_up(n+2)
-
-        if (logfile != None):
-            print("%d\t%f" % (i, total_reward), file=logfile)
-
-        # Wikipedia says on this: "When the problem is stochastic [which it is!],
-        # the algorithm still converges under some technical conditions on the
-        # learning rate, that require it to decrease to zero.
-        # So let's sloooowly decrease our learning rate here. Otherwise it won't
-        # converge, but instead oscillate plus/minus 0.5.
-        # However, if we set the friendlyness of our system to 1.0, then it would
-        # also converge without this learning rate reduction, because we have a
-        # non-stochastic but a deterministic system now.
-        alpha *= alpha_reduction
-        epsilon *= epsilon_reduction
-
-
-        # stop once we're below theta for at least 100 episodes. But not before we went above theta at least once.
-        if maxdiff < theta:
-            stopstate -= 1
-            if stopstate == 0:
-                break
-        else:
-            stopstate = 100
+    if (logfile != None):
+        print("%d\t%f" % (i, total_reward), file=logfile)
+
+    # Wikipedia says on this: "When the problem is stochastic [which it is!],
+    # the algorithm still converges under some technical conditions on the
+    # learning rate, that require it to decrease to zero.
+    # So let's sloooowly decrease our learning rate here. Otherwise it won't
+    # converge, but instead oscillate plus/minus 0.5.
+    # However, if we set the friendlyness of our system to 1.0, then it would
+    # also converge without this learning rate reduction, because we have a
+    # non-stochastic but a deterministic system now.
+    alpha *= alpha_reduction
+    epsilon *= epsilon_reduction
+
-    print("finished after %.3d iterations, alpha=%.3e, epsilon=%.3e"%(i,alpha,epsilon))
-    visualize2(maze,Q)
-    if logfile != None:
-        logfile.close()
+    # stop once we're below theta for at least 100 episodes. But not before we went above theta at least once.
+    if maxdiff < theta:
+        stopstate -= 1
+        if stopstate == 0:
+            break
+    else:
+        stopstate = 100
+
+print("finished after %.3d iterations, alpha=%.3e, epsilon=%.3e"%(i,alpha,epsilon))
+visualize2(maze,Q)
+if logfile != None:
+    logfile.close()
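For context, the loop this commit promotes to the top level is standard tabular Q-learning: each step nudges Q(s,a) toward r + gamma * max_a' Q(s',a'), actions are chosen epsilon-greedily, and alpha and epsilon are decayed geometrically once per episode so the updates can settle. Below is a minimal, self-contained sketch of that scheme; the corridor environment and its step() rewards are illustrative stand-ins, not the World/maze class from sol.py.

from random import random, randrange

N_STATES = 5                     # hypothetical corridor: states 0..4, goal at 4
ACTIONS = (0, 1)                 # 0 = step left, 1 = step right
GOAL = N_STATES - 1

def step(s, a):
    # Illustrative stand-in for World.take_action: returns (reward, next_state).
    ss = max(0, min(N_STATES - 1, s + (1 if a == 1 else -1)))
    return (1.0 if ss == GOAL else -0.01), ss

Q = [[0.0] * len(ACTIONS) for _ in range(N_STATES)]
alpha, epsilon, gamma = 0.5, 0.3, 0.9
alpha_reduction = epsilon_reduction = 0.999   # slow geometric decay, as in the patch

for episode in range(2000):
    s = 0
    for _ in range(100):                      # cap the episode length
        greedy = max(ACTIONS, key=lambda a: Q[s][a])
        action = randrange(len(ACTIONS)) if random() < epsilon else greedy
        r, ss = step(s, action)
        # Q-learning update: move Q(s, action) toward r + gamma * max_a' Q(ss, a')
        Q[s][action] += alpha * (r + gamma * max(Q[ss]) - Q[s][action])
        s = ss
        if s == GOAL:
            break
    alpha *= alpha_reduction                  # learning rate must shrink toward zero
    epsilon *= epsilon_reduction              # explore less as the estimates improve

print(Q)

One detail differs from the patch: when rnd < epsilon, sol.py rotates to one of the three non-greedy directions via ( greedy + int(1 + 3 * rnd / epsilon) ) % 4, while the sketch simply picks a uniformly random action with probability epsilon; both are forms of epsilon-greedy exploration.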