The CliffWalking problem takes place in a 4 x 12 grid. The agent starts at the bottom-left corner of the grid, the goal is the bottom-right corner, and the objective is to move the agent to the goal. At each step the agent moves one cell in one of four directions (up, down, left, right), and every move yields a reward of -1. The cells along the bottom edge between the start and the goal form the cliff: stepping into them yields a reward of -100 and sends the agent back to the start.
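As a quick sanity check before training, the snippet below is a minimal sketch (assuming the same classic gym API as the code in this section, i.e. reset returns the observation and step returns a 4-tuple) that probes the size of the state and action spaces and the cliff penalty:

import gym

env = gym.make("CliffWalking-v0")
print(env.observation_space.n)  # 48 discrete states (4 rows x 12 columns)
print(env.action_space.n)       # 4 actions: 0 up, 1 right, 2 down, 3 left
obs = env.reset()               # start state: the bottom-left cell
next_obs, reward, done, _ = env.step(1)  # move right, straight into the cliff
print(reward)                   # -100, and the agent is sent back to the start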
Code implementation:
import gym

env = gym.make("CliffWalking-v0")  # actions: 0 up, 1 right, 2 down, 3 left
env = CliffWalkingWapper(env)  # custom wrapper defined elsewhere in the project
agent = QLearning(  # cfg holds the training hyperparameters (defined elsewhere)
    obs_dim=env.observation_space.n,
    action_dim=env.action_space.n,
    learning_rate=cfg.policy_lr,
    gamma=cfg.gamma,
    epsilon_start=cfg.epsilon_start,
    epsilon_end=cfg.epsilon_end,
    epsilon_decay=cfg.epsilon_decay,
)
render = False  # whether to render the GUI
rewards = []  # record the reward of every episode
MA_rewards = []  # record the moving-average reward
steps = []  # record the number of steps of every episode
for i_episode in range(1, cfg.max_episodes + 1):
    ep_reward = 0  # total reward of the current episode
    ep_steps = 0   # number of steps taken in the current episode
    obs = env.reset()  # reset the environment, i.e. start a new episode
    while True:
        action = agent.sample(obs)  # choose an action according to the algorithm
        next_obs, reward, done, _ = env.step(action)  # interact with the environment
        # train the Q-learning agent; the next action is not needed
        agent.learn(obs, action, reward, next_obs, done)
        obs = next_obs  # update the observation
        ep_reward += reward
        ep_steps += 1  # count the steps
        if render:
            env.render()  # render a new frame
        if done:
            break
    steps.append(ep_steps)
    rewards.append(ep_reward)
    # compute the moving average of the rewards
    if i_episode == 1:
        MA_rewards.append(ep_reward)
    else:
        MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
    print('Episode %s: steps = %s, reward = %.1f, explore = %.2f' % (
        i_episode, ep_steps, ep_reward, agent.epsilon))
    # render every 20 episodes to inspect the learned behavior
    if i_episode % 20 == 0:
        render = True
    else:
        render = False
agent.save()  # training finished, save the model (the Q-table)
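The QLearning agent and the cfg hyperparameter object (policy_lr, gamma, epsilon_start, epsilon_end, epsilon_decay, max_episodes) are assumed to be defined elsewhere in the project. For reference, a minimal sketch of what such a tabular Q-learning agent could look like is given below; the class layout, the exponential epsilon-decay schedule, and the save path are illustrative assumptions, not the original implementation.

import math
import numpy as np

class QLearning:
    def __init__(self, obs_dim, action_dim, learning_rate, gamma,
                 epsilon_start, epsilon_end, epsilon_decay):
        self.action_dim = action_dim
        self.lr = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.sample_count = 0
        self.Q = np.zeros((obs_dim, action_dim))  # tabular Q-function

    def sample(self, obs):
        # epsilon-greedy action selection; epsilon decays exponentially with the sample count
        self.sample_count += 1
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1.0 * self.sample_count / self.epsilon_decay)
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.randint(self.action_dim)  # explore: random action
        return int(np.argmax(self.Q[obs]))             # exploit: greedy action

    def learn(self, obs, action, reward, next_obs, done):
        # Q-learning (off-policy TD) update:
        # Q(s, a) <- Q(s, a) + lr * [r + gamma * max_a' Q(s', a') - Q(s, a)]
        target = reward if done else reward + self.gamma * np.max(self.Q[next_obs])
        self.Q[obs, action] += self.lr * (target - self.Q[obs, action])

    def save(self, path="q_table.npy"):
        np.save(path, self.Q)  # persist the learned Q-table (path is a placeholder)

The key line is the temporal-difference update in learn: because the target uses the maximum over the next state's action values rather than the action actually taken next, Q-learning is off-policy and, unlike Sarsa, does not need the next action.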