Diffstat (limited to 'sol.py')
-rw-r--r--  sol.py  156
1 file changed, 62 insertions(+), 94 deletions(-)
diff --git a/sol.py b/sol.py
index a90de3b..f98421d 100644
--- a/sol.py
+++ b/sol.py
@@ -48,16 +48,12 @@ arg=args(argv)
print(arg)
mode = None
-if arg['-1'] or arg['--policy-evaluation']:
- mode = 1
-elif arg['-2'] or arg['--q-learning']:
- mode = 2
-else:
+if arg['-h']:
print("Usage: %s MODE [OPTIONS]" % argv[0])
print(" MODE: -1 / --policy-evaluation or\n" +
" -2 / --q-learning\n" +
" OPTIONS: --theta NUM # convergence threshold\n" +
- " # default: %f / %f for -1 / -2\n" % (theta_1, thetha_2) +
+ " # default: %f / %f for -1 / -2\n" % (theta_1, theta_2) +
" --gamma NUM # learning discount for value iteration\n" +
" # default: %f\n" % gamma +
" --alpha NUM # learning rate for q-learning\n" +
@@ -73,7 +69,7 @@ else:
" # default: %f\n\n" % epsilon_reduction +
" --frameskip NUM # frameskip for visualisation\n" +
" # default: %f\n" % frameskip +
- " --quiet # disable visualisation" +
+ " --quiet # disable visualisation\n" +
" --file FILE # output file for q learning")
exit()
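
A minimal, self-contained sketch of how the options documented in the usage text above could be mapped onto the solver's hyperparameters. The args() helper used in sol.py is not reproduced here; a plain dict stands in for its result, and the flag names mirror the usage text while the default values are illustrative assumptions, not the ones from the patch.

    defaults = {
        'theta_2': 1e-4,    # convergence threshold for -2 / q-learning
        'gamma': 0.9,       # discount factor
        'alpha': 0.1,       # q-learning learning rate
        'epsilon': 0.2,     # exploration rate
        'frameskip': 0,     # frames skipped between visualisations
    }

    def hyperparameters(arg, defaults):
        """Overlay CLI option values (e.g. from a docopt-style dict) on the defaults."""
        out = dict(defaults)
        for flag, key in (('--theta', 'theta_2'), ('--gamma', 'gamma'),
                          ('--alpha', 'alpha'), ('--epsilon', 'epsilon'),
                          ('--frameskip', 'frameskip')):
            if arg.get(flag) is not None:
                out[key] = float(arg[flag])
        return out

    # Example: only --gamma was given on the command line.
    print(hyperparameters({'--gamma': '0.95', '--alpha': None}, defaults))
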
@@ -246,96 +242,68 @@ class World:
def is_final(self,s):
return self.maze[s[1]][s[0]] == 2
-if mode == 1: # policy evaluation
- theta = theta_1
- a = World(maze, start)
-
- V = [ [0.0] * a.xlen for i in range(a.ylen) ]
- pi = [ [ [0.25] * 4 for i in range(a.xlen) ] for j in range(a.ylen) ]
-
- i=0
- while True:
- i = i + 1
- delta = 0
- for x,y in product(range(a.xlen), range(a.ylen)):
- v = V[y][x]
- V[y][x] = sum( a.P((x,y),(xx,yy), pi[y][x]) * ( a.R((x,y),(xx,yy), pi[y][x]) + gamma * V[yy][xx] ) for xx,yy in product(range(a.xlen), range(a.ylen)) )
- delta = max(delta, abs(v - V[y][x]))
+theta = theta_2
+
+a = World(maze, start)
+Q = [ [ [0. for k in range(4)] for i in range(a.xlen) ] for j in range(a.ylen) ]
+
+i=0
+stopstate = -1
+total_reward = 0.
+for i in range(1000000):
+ s = start
+ maxdiff=0.
+ for j in range(100):
+ # epsilon-greedy
+ greedy = argmax(Q[s[1]][s[0]])
+ rnd = random()
+ action = None
+ if rnd < epsilon:
+ action = ( greedy + int(1 + 3 * rnd / epsilon) ) % 4
+ else:
+ action = greedy
+
+ r,ss = a.take_action(s[0],s[1], action)
+ #print ((r,ss))
+ diff = alpha * (r + gamma * max( [ Q[ss[1]][ss[0]][aa] for aa in directions ] ) - Q[s[1]][s[0]][action])
+ Q[s[1]][s[0]][action] += diff
+ maxdiff = max(abs(diff),maxdiff)
+ total_reward += r
+ s = ss
+ if a.is_final(ss):
+ break
- print("iteration %.3d, delta=%.7f"%(i,delta))
+ if (i % (frameskip+1) == 0):
+ print("iteration %.3d, alpha=%.3e, epsilon=%.3e maxdiff=%.7f"%(i,alpha,epsilon,maxdiff))
n = 0
if visu:
- n = visualize(maze,V)
+ n = visualize2(maze,Q)
cursor_up(n+2)
- if (delta < theta):
- break
- print("finished after %d iterations" % i)
- visualize(maze,V)
-
-elif True: # q-learning
- theta = theta_2
-
- a = World(maze, start)
- Q = [ [ [0. for k in range(4)] for i in range(a.xlen) ] for j in range(a.ylen) ]
-
- i=0
- stopstate = -1
- total_reward = 0.
- for i in range(1000000):
- s = start
- maxdiff=0.
- for j in range(100):
- # epsilon-greedy
- greedy = argmax(Q[s[1]][s[0]])
- rnd = random()
- action = None
- if rnd < epsilon:
- action = ( greedy + int(1 + 3 * rnd / epsilon) ) % 4
- else:
- action = greedy
-
- r,ss = a.take_action(s[0],s[1], action)
- #print ((r,ss))
- diff = alpha * (r + gamma * max( [ Q[ss[1]][ss[0]][aa] for aa in directions ] ) - Q[s[1]][s[0]][action])
- Q[s[1]][s[0]][action] += diff
- maxdiff = max(abs(diff),maxdiff)
- total_reward += r
- s = ss
- if a.is_final(ss):
- break
-
- if (i % (frameskip+1) == 0):
- print("iteration %.3d, alpha=%.3e, epsilon=%.3e maxdiff=%.7f"%(i,alpha,epsilon,maxdiff))
- n = 0
- if visu:
- n = visualize2(maze,Q)
- cursor_up(n+2)
-
- if (logfile != None):
- print("%d\t%f" % (i, total_reward), file=logfile)
-
- # Wikipedia says on this: "When the problem is stochastic [which it is!],
- # the algorithm still converges under some technical conditions on the
- # learning rate, that require it to decrease to zero.
- # So let's sloooowly decrease our learning rate here. Otherwise it won't
- # converge, but instead oscillate plus/minus 0.5.
- # However, if we set the friendlyness of our system to 1.0, then it would
- # also converge without this learning rate reduction, because we have a
- # non-stochastic but a deterministic system now.
- alpha *= alpha_reduction
- epsilon *= epsilon_reduction
-
-
- # stop once we're below theta for at least 100 episodes. But not before we went above theta at least once.
- if maxdiff < theta:
- stopstate -= 1
- if stopstate == 0:
- break
- else:
- stopstate = 100
+ if (logfile != None):
+ print("%d\t%f" % (i, total_reward), file=logfile)
+
+ # Wikipedia says on this: "When the problem is stochastic [which it is!],
+ # the algorithm still converges under some technical conditions on the
+ # learning rate, that require it to decrease to zero.
+ # So let's sloooowly decrease our learning rate here. Otherwise it won't
+ # converge, but instead oscillate plus/minus 0.5.
+ # However, if we set the friendlyness of our system to 1.0, then it would
+ # also converge without this learning rate reduction, because we have a
+ # non-stochastic but a deterministic system now.
+ alpha *= alpha_reduction
+ epsilon *= epsilon_reduction
+
- print("finished after %.3d iterations, alpha=%.3e, epsilon=%.3e"%(i,alpha,epsilon))
- visualize2(maze,Q)
- if logfile != None:
- logfile.close()
+ # stop once we're below theta for at least 100 episodes. But not before we went above theta at least once.
+ if maxdiff < theta:
+ stopstate -= 1
+ if stopstate == 0:
+ break
+ else:
+ stopstate = 100
+
+print("finished after %.3d iterations, alpha=%.3e, epsilon=%.3e"%(i,alpha,epsilon))
+visualize2(maze,Q)
+if logfile != None:
+ logfile.close()
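
A minimal, self-contained sketch of the epsilon-greedy tabular Q-learning loop that this patch promotes to top level. The World, take_action, argmax and directions helpers from sol.py are not reproduced; a toy one-dimensional corridor stands in for the maze, exploration picks a uniformly random action rather than the patch's "greedy plus offset" trick, and all hyperparameter values are illustrative assumptions.

    import random

    N_STATES, GOAL = 6, 5          # corridor of states 0..5, state 5 is terminal
    ACTIONS = (0, 1)               # 0 = move left, 1 = move right
    alpha, gamma, epsilon = 0.5, 0.9, 0.1

    Q = [[0.0] * len(ACTIONS) for _ in range(N_STATES)]

    def step(s, a):
        """Toy environment: -1 reward per move, 0 on reaching the goal."""
        ss = max(0, min(N_STATES - 1, s + (1 if a == 1 else -1)))
        return (0.0 if ss == GOAL else -1.0), ss

    def epsilon_greedy(q_row, eps):
        """Greedy action with probability 1 - eps, otherwise a uniform random one."""
        if random.random() < eps:
            return random.randrange(len(q_row))
        return max(range(len(q_row)), key=lambda a: q_row[a])

    for episode in range(500):
        s = 0
        for _ in range(100):       # same per-episode step cap as sol.py
            a = epsilon_greedy(Q[s], epsilon)
            r, ss = step(s, a)
            # TD update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
            Q[s][a] += alpha * (r + gamma * max(Q[ss]) - Q[s][a])
            s = ss
            if s == GOAL:
                break

    print([max(row) for row in Q])  # learned state values along the corridor

As the comment in the patch notes, in the stochastic case alpha (and here also epsilon) is decayed toward zero each episode so the updates settle instead of oscillating; the sketch above omits that decay for brevity.
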