-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathController.py
More file actions
166 lines (136 loc) · 4.33 KB
/
Copy pathController.py
File metadata and controls
166 lines (136 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import sys
from Observation import *
from Reward import *
from Action import *
from Agent import *
from Environment import *
import numpy
# ---- Experiment configuration ----
# Optional phases that run after the initial training; both are off by default.
reverie = False #3#
retrain = False
# Number of train-then-test iterations performed by the learning loop.
episodes = 1000
# Number of progress reports spread over the run: the learning loop prints on
# every (episodes / trainingReportRate)-th iteration.
trainingReportRate = 1000
# Number of post-training policy executions whose traces are kept as memories.
numMemories = 1 #2#
# Highest total reward seen in any test run so far (None until the first run).
maxr = None
# Build the environment used for the initial training phase.
gridEnvironment = Environment()
# Apply the training-time switches in a fixed order (same order as before).
for _option, _setting in (
    ("randomStart", True),
    ("humanWander", False),
    ("verbose", False),
    ("humanCanTorture", True), #4#
):
    setattr(gridEnvironment, _option, _setting)
# Create the agent bound to that environment, with its verbose flag off.
gridAgent = Agent(gridEnvironment)
gridAgent.verbose = False
# This is where learning happens.
# Each iteration runs one Q-learning pass, then executes the resulting policy
# once and keeps track of the best total reward seen in any test run.

# Number of iterations between progress reports. Floor division keeps this an
# integer on both Python 2 and 3, and the lower bound of 1 prevents a modulo by
# zero when trainingReportRate is larger than episodes.
reportInterval = max(1, episodes // trainingReportRate)

for i in range(episodes):
    # Train
    gridAgent.agent_reset()
    gridAgent.qLearn(gridAgent.initialObs)
    # Test
    gridAgent.agent_reset()
    gridAgent.executePolicy(gridAgent.initialObs)
    # Report
    totalr = gridAgent.totalReward
    if maxr is None or totalr > maxr:
        maxr = totalr
    if i % reportInterval == 0:
        # Single-argument print: same text under Python 2 and Python 3.
        print("iteration: %s max reward: %s" % (i, maxr))
# Reset the environment for policy execution
gridEnvironment.verbose = True
gridEnvironment.randomStart = True # Don't change this or memories won't be created properly!
gridEnvironment.humanWander = False
gridEnvironment.humanCanTorture = True
gridAgent.verbose = True

# Make a number of memories. Also doubles as testing.
# Each run executes the current policy from the agent's initial observation
# and stores the resulting trace in the agent's memory list.
print("---")
for i in range(numMemories):
    # Single-argument prints: same text under Python 2 and Python 3.
    print("Execute Policy %s" % (i,))
    gridAgent.agent_reset()
    gridAgent.executePolicy(gridAgent.initialObs)
    print("total reward %s" % (gridAgent.totalReward,))
    gridAgent.memory.append(gridAgent.trace)
    print("---")
# Reverie mode
# When enabled, the agent first replays every stored memory through
# replayMemory(), then the environment is rebuilt and the policy is executed
# repeatedly, with each new trace appended to the agent's memories.
if reverie:
    # get agent ready to learn from memories
    gridAgent.lastAction = Action()
    gridAgent.lastObservation = Observation()
    gridAgent.verbose = True
    gridEnvironment.verbose = True

    # Replaying memories creates the value table that the agent would have if
    # all it had to go on was the memories
    print("Replaying memories %s" % (len(gridAgent.memory),))
    gridEnvironment.randomStart = False # Don't change this for the replay
    print("---")
    for counter, m in enumerate(gridAgent.memory):
        # Start the environment from the world state recorded in the first
        # entry of this memory.
        obs = m[0][0].worldState
        print("Learn from memory %s" % (counter,))
        print("init state %s" % (obs,))
        gridEnvironment.startState = obs
        gridAgent.agent_reset()
        gridAgent.lastAction = Action()
        gridAgent.lastObservation = Observation()
        gridAgent.gridEnvironment = gridEnvironment
        gridAgent.initialObs = gridEnvironment.env_start()
        gridAgent.initializeInitialObservation(gridEnvironment)
        gridAgent.replayMemory(gridAgent.initialObs, m)
        # Report
        print("replay %s total reward %s" % (counter, gridAgent.totalReward))
        print("---")

    # Reset the environment for policy execution
    gridEnvironment = Environment()
    gridEnvironment.verbose = True
    gridEnvironment.randomStart = True
    gridEnvironment.humanWander = False
    gridEnvironment.humanCanTorture = True
    gridAgent.gridEnvironment = gridEnvironment
    gridAgent.agent_reset()
    gridAgent.verbose = True

    # Test new v table
    postReverieRuns = 100  # number of policy executions after the replay
    print("---")
    for i in range(postReverieRuns):
        print("Execute Post-Reverie Policy %s" % (i,))
        # The environment object was replaced above, so the initial
        # observation is regenerated from it before every run.
        gridAgent.initialObs = gridEnvironment.env_start()
        gridAgent.initializeInitialObservation(gridEnvironment)
        gridAgent.agent_reset()
        gridAgent.executePolicy(gridAgent.initialObs)
        print("total reward %s" % (gridAgent.totalReward,))
        gridAgent.memory.append(gridAgent.trace)
        print("---")
# Retrain the agent
# NOTE(review): written here as a top-level phase; confirm that it was not
# meant to be nested under `if reverie:`.
if retrain:
    maxr = None

    # NOTE(review): the retraining loop is set to run zero episodes, so no
    # additional learning happens; confirm whether `episodes` was intended.
    retrainEpisodes = 0
    # Iterations between progress reports; floor division plus a lower bound
    # of 1 avoids a modulo by zero when trainingReportRate exceeds episodes.
    reportInterval = max(1, episodes // trainingReportRate)

    for i in range(retrainEpisodes):
        # Train
        gridAgent.agent_reset()
        gridAgent.qLearn(gridAgent.initialObs)
        # Test
        gridAgent.agent_reset()
        gridAgent.executePolicy(gridAgent.initialObs)
        # Report
        totalr = gridAgent.totalReward
        if maxr is None or totalr > maxr:
            maxr = totalr
        if i % reportInterval == 0:
            # Single-argument print: same text under Python 2 and Python 3.
            print("iteration: %s max reward: %s" % (i, maxr))

    # Reset the environment for policy execution
    gridEnvironment.verbose = True
    gridEnvironment.randomStart = True
    gridEnvironment.humanWander = False
    gridEnvironment.humanCanTorture = True
    gridAgent.agent_reset()

    # Test new v table
    print("---")
    for i in range(numMemories):
        print("Execute Policy %s" % (i,))
        gridAgent.initialObs = gridEnvironment.env_start()
        gridAgent.initializeInitialObservation(gridEnvironment)
        gridAgent.agent_reset()
        gridAgent.executePolicy(gridAgent.initialObs)
        print("total reward %s" % (gridAgent.totalReward,))
        gridAgent.memory.append(gridAgent.trace)