I tried to solve a best-route search problem with a Python program, using ["Reinforcement Learning"](http://www.amazon.co.jp/dp/4627826613) by Sutton and Barto (Japanese edition) as a textbook.
Consider the search problem in the maze shown in the figure. The white circle is the agent, the red cells are the negative-reward areas, and the blue cell is the goal. The search problem here means finding a route to the goal that maximizes the accumulated reward. The rules, as implemented in the code below, are:

* The agent starts at the top-left cell; the goal is the bottom-right cell.
* At each step the agent moves one cell up, down, left, or right, and cannot move off the grid.
* Moving into a red cell gives a reward of $-1$, reaching the goal gives $+3$, and any other move gives $0$.
* When the agent reaches the goal, the episode ends and the agent is returned to the start.
 
Update the $Q$ value with the following formula.

$$
Q_{new}(s_t, a) = Q_{old}(s_t, a) + \alpha \bigl[ r_{t+1} + \gamma \max_{a'} Q(s_{t+1}, a') - Q_{old}(s_t, a) \bigr]
$$

$Q(s_t, a)$ represents the value of taking action $a$ in state $s_t$. Taking action $a$ in state $s_t$ yields the reward $r_{t+1}$. When the $Q$ value is updated, the latest reward $r_{t+1}$ is added to the maximum value attainable by an action $a'$ in the next state $s_{t+1}$, multiplied by $\gamma$. In other words, an action in a given state is chosen by considering not only the immediate reward but also the value of the states that follow. $\gamma$ is called the discount rate and $\alpha$ the learning rate.
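As a minimal illustration of a single update step with the parameters used below ($\alpha = 0.2$, $\gamma = 0.9$), and with made-up values rather than output from the program:

```python
# One Q-learning update step, standalone sketch with illustrative values.
alpha, gamma = 0.2, 0.9

q_old = 0.0        # Q_old(s_t, a): current estimate for the chosen action
reward = 3.0       # r_{t+1}: reward observed after the action (e.g. reaching the goal)
next_max_q = 0.0   # max_{a'} Q(s_{t+1}, a'): best estimate in the next state

q_new = q_old + alpha * (reward + gamma * next_max_q - q_old)
print(q_new)  # 0.6
```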
I implemented it in Python. The probability of taking a random action is $\varepsilon = 0.1$, the learning rate is $\alpha = 0.2$, and the discount rate is $\gamma = 0.9$. Learning ends after the agent has reached the goal $300$ times.
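Action selection is $\varepsilon$-greedy: with probability $\varepsilon$ the agent picks a random legal action, otherwise the action with the highest $Q$ value. A minimal sketch of that rule (`legal_actions` and `q_values` are hypothetical names; the actual selection lives in `agent.py` below):

```python
import random

def select_action(legal_actions, q_values, epsilon=0.1):
    # explore with probability epsilon, otherwise exploit the current estimates
    if random.random() < epsilon:
        return random.choice(legal_actions)
    return max(legal_actions, key=lambda a: q_values[a])
```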
.
|- agent.py (agent class)
|- mazeimage.py (display / save class)
|- main.py
|- resources
      |- maze.csv
agent.py
import numpy
import cv2
import random
import copy
class Agent:
	# constructor
	def __init__(self, maze_shape):
		self.state = numpy.array([0, 0])
		self.actions = self.__create_actions(maze_shape)
		self.moves = {0: numpy.array([0, -1]), 1: numpy.array([1, 0]), 2: numpy.array([0, 1]), 3: numpy.array([-1, 0])}	# (y, x) offsets: 0 = left, 1 = down, 2 = right, 3 = up
		self.q = numpy.zeros((4, ) + maze_shape)	# Q table: one value per (action, cell)
	# public method
	def act(self, maze, epsilon, alpha, gamma):
		act_index = self.__select_action(epsilon)
		move = self.moves.get(act_index)
		reward = maze[tuple(self.state + move)]
		self.update_q(act_index, move, reward, alpha, gamma)
		self.state += move
	def update_q(self, act_index, move, reward, alpha, gamma):
		y, x = self.state
		_q = self.q[act_index, y, x]
		self.q[act_index, y, x] = _q + alpha * (reward + gamma * self.__get_max_q(move) - _q)
	def goal(self, maze_shape):
		return numpy.array_equal(self.state, numpy.array(maze_shape) - 1)
	def reset(self):
		self.state = numpy.array([0, 0])
	# private method
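	# for every cell, build the list of actions that keep the agent inside the maze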
	def __create_actions(self, maze_shape):
		actions = []
		maze_h, maze_w = maze_shape
		for j in range(maze_h):
			actions.append([])
			for i in range(maze_w):
				action = [0, 1, 2, 3]
				self.__remove_actions(action, maze_h, maze_w, j, i)
				actions[j].append(action)
		return numpy.array(actions, dtype = object)	# cells have different numbers of legal actions, so store lists in an object array
	def __remove_actions(self, action, maze_h, maze_w, j, i):
		if i == 0:
			action.remove(0)
		if i == maze_w - 1:
			action.remove(2)
		if j == 0:
			action.remove(3)
		if j == maze_h - 1:
			action.remove(1)
	
	def __select_action(self, epsilon):
		y, x = self.state
		action = copy.deepcopy(self.actions[y, x])
		if numpy.random.rand() > epsilon:
			mode = '!!!greedy!!!'
			act_index = self.__select_greedy_action(action)
		else:
			mode = '!!!random!!!'
			act_index = self.__select_random_action(action)
		print u'%s  state: (%d, %d), action: %d' % (mode, y, x, act_index)
		return act_index
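	# greedy choice: highest Q value among the legal actions, ties broken at random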
	def __select_greedy_action(self, action):
		y, x = self.state
		_max = self.q[action, y, x].max()
		_indexes = list(numpy.argwhere(self.q[action, y, x] == _max))
		random.shuffle(_indexes)
		return action[_indexes[0][0]]
	def __select_random_action(self, action):
		random.shuffle(action)
		return action[0]
	def __get_max_q(self, move):
		# max Q over the legal actions of the next state (the lookahead term of the update)
		y, x = self.state + move
		next_actions = self.actions[y, x]
		return self.q[next_actions, y, x].max()
mazeimage.py
import numpy
import cv2
class MazeImage:
	# constructor
	def __init__(self, maze, height, width):
		self.height, self.width = (height, width)
		self.maze_h, self.maze_w = maze.shape
		self.ystride = height / self.maze_h
		self.xstride = width / self.maze_w
		self.map_org = self.__create_image(maze)
		self.map_now = self.map_org
		self.writer = cv2.VideoWriter('q_learning.mv4', cv2.cv.CV_FOURCC('m', 'p', '4', 'v'), 30, (self.map_org.shape[1], self.map_org.shape[0]))
	# public method
	def show(self, agent):
		self.map_now = self.map_org.copy()
		_y, _x = agent.state
		center = (int((_x + 0.5) * self.xstride), int((_y + 0.5) * self.ystride))
		cv2.circle(self.map_now, center, 11, (255, 255, 255), -1, cv2.cv.CV_AA)
		cv2.imshow('', self.map_now)
		return cv2.waitKey(10)
	def save_movie(self):
		self.writer.write(self.map_now)
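	# draw, in each cell, an arrow for the action with the highest Q value (the learned policy)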
	def shortest_path(self, q):
		shortest_map = self.map_org.copy()
		q_arrow = numpy.vectorize(lambda x: {0: '<', 1: 'V', 2: '>', 3: '^'}.get(x))(q.argmax(axis = 0))
		for j in range(self.maze_h):
			for i in range(self.maze_w):
				pt = (int(self.xstride * (i + 0.35)), int(self.ystride * (j + 0.6)))
				cv2.putText(shortest_map, q_arrow[j, i], pt, cv2.FONT_HERSHEY_PLAIN, 1, [255, 255, 255])
		return shortest_map
	# private method
	def __create_image(self, maze):
		image = numpy.zeros((self.height, self.width, 3)).astype('uint8')
		for j in range(self.maze_h):
			for i in range(self.maze_w):
				tl = (self.xstride * i, self.ystride * j)
				br = (self.xstride * (i + 1) - 1, self.ystride * (j + 1) - 1)
				cv2.rectangle(image, tl, br, self.__color(maze[j, i]), -1)
		return image
	def __color(self, score):
		if score == 0.0:
			return [0, 0, 0]
		elif score == -1.0:
			return [0, 0, 127]
		else:
			return [127, 0, 0]
main.py
from agent import *
from mazeimage import *
if __name__ == '__main__':
	# init
	epsilon = 0.1
	alpha = 0.2
	gamma = 0.9
	maze = numpy.loadtxt('./resources/maze.csv', delimiter = ',')
	agent = Agent(maze.shape)
	maze_image = MazeImage(maze, 300, 300)
	trial = 0
	while True:
		if maze_image.show(agent) == 27:
			print '!!!escape!!!'
			break
		agent.act(maze, epsilon, alpha, gamma)
		maze_image.save_movie()
		
		if agent.goal(maze.shape):
			print '\033[32m' + '!!!goal!!!' + '\033[0m'
			trial += 1
			print 'next trial: %d' % trial
			agent.reset()
		if trial == 300:
			break
	maze_image.save_movie()
	cv2.imwrite('shortest.png', maze_image.shortest_path(agent.q))
	cv2.destroyAllWindows()
maze.csv
| 0 | -1 | 0 | 0 | 0 | 0 | 
| 0 | -1 | 0 | -1 | -1 | 0 | 
| 0 | -1 | 0 | -1 | 0 | 0 | 
| 0 | -1 | 0 | -1 | 0 | -1 | 
| 0 | -1 | 0 | -1 | 0 | 0 | 
| 0 | 0 | 0 | -1 | -1 | 3 | 
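For reference, the table above corresponds to comma-separated file content along these lines, which is what `numpy.loadtxt` reads ($0$: free cell, $-1$: negative area, $3$: goal); the exact formatting of the author's file may differ:

```csv
0,-1,0,0,0,0
0,-1,0,-1,-1,0
0,-1,0,-1,0,0
0,-1,0,-1,0,-1
0,-1,0,-1,0,0
0,0,0,-1,-1,3
```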
The figure shows the best route obtained by learning. The agent learns a route that reaches the goal without passing through the negative-reward cells. The program also saves the learning process as a video, so please run it if you are interested.
 
I was able to solve the best-route search problem. As the update formula for $Q(s_t, a)$ shows, the value seeps backwards from the goal side because each state takes into account the value of the state one step ahead.
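As a rough illustration with the parameters above (hand-computed, not program output): once the cell next to the goal has picked up a value of $0.6$, as in the earlier single-step example, a later visit to the cell one step further back transfers part of that value:

$$
Q \leftarrow 0 + 0.2 \times \bigl(0 + 0.9 \times 0.6 - 0\bigr) = 0.108
$$

Repeated over many episodes, this spreads the goal's value backwards along the route, which is exactly the seeping effect described above.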