2022-10-02
Playing with an inverted pendulum using value iteration in Python
I recently implemented an inverted pendulum swing-up using value iteration. Here is what the result looks like:
[animation: pendulum]
Here I share my implementation. The pendulum code is:
# a few packages we need to import
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.figure
import matplotlib.animation as animation
from matplotlib.backends.backend_agg import FigureCanvasAgg
import IPython
import IPython.display


class Pendulum:
    """
    This class describes an inverted pendulum and provides some helper functions
    """

    def __init__(self):
        """
        constructor of the class
        """
        # gravity constant
        self.g = 9.81
        # number of dimensions (angle and angular velocity)
        self.state_dims = 2
        # the maximum velocity
        self.vmax = 6.
        # the range of allowable states
        self.state_range = np.array([[0, 2*np.pi], [-self.vmax, self.vmax]])
        # simulation step
        self.delta_t = 0.1
        # internal integration step
        self._internaldt = 0.01
        self._integration_ratio = int(round(self.delta_t / self._internaldt))

    def next_state(self, x, u):
        """
        This function integrates the pendulum for one step of self.delta_t seconds
        Inputs:
        x: state of the pendulum (angle, velocity) as a 2D numpy array
        u: control as a scalar
        Output:
        the state of the pendulum as a 2D numpy array at the end of the integration
        """
        x_next = x[0]
        v_next = x[1]
        for i in range(self._integration_ratio):
            xx_next = (x_next + self._internaldt * v_next) % (2*np.pi)
            v_next = np.clip(v_next + self._internaldt * (u - self.g*np.sin(x_next)),
                             -self.vmax, self.vmax)
            x_next = xx_next
        return np.array([x_next, v_next])

    def simulate(self, x0, policy, T):
        """
        This function simulates the pendulum for T seconds from initial state x0
        using a policy (policy is called as policy(x) and returns one control)
        Inputs:
        x0: the initial conditions of the pendulum as a 2D array (angle and velocity)
        T: the time to integrate for
        Output:
        x (2D array) and u (1D array) containing the time evolution of states and control
        """
        horizon_length = int(T/self.delta_t)
        x = np.empty([2, horizon_length+1])
        x[:, 0] = x0
        u = np.empty([horizon_length])
        for i in range(horizon_length):
            u[i] = policy(x[:, i])
            x[:, i+1] = self.next_state(x[:, i], u[i])
        return x, u

    def animate_robot(self, x, dt=0.01):
        """
        This function makes an animation showing the behavior of the pendulum;
        it takes as input the result of a simulation - dt is the sampling time (0.1s normally)
        """
        # here we check if we need to down-sample the data for display
        # (we want 100ms between frames or more)
        min_dt = 0.1
        if dt < min_dt:
            steps = int(min_dt/dt)
            use_dt = int(min_dt * 1000)
        else:
            steps = 1
            use_dt = int(dt * 1000)
        plotx = x[:, ::steps]

        fig = matplotlib.figure.Figure(figsize=[6, 6])
        FigureCanvasAgg(fig)
        ax = fig.add_subplot(111, autoscale_on=False, xlim=[-1.3, 1.3], ylim=[-1.3, 1.3])
        ax.grid()

        # create the pendulum rod and bob
        list_of_lines = []
        line, = ax.plot([], [], 'k', lw=2)
        list_of_lines.append(line)
        line, = ax.plot([], [], 'o', lw=2)
        list_of_lines.append(line)

        def animate(i):
            # reset all lines
            for l in list_of_lines:
                l.set_data([], [])
            x_pend = np.sin(plotx[0, i])
            y_pend = -np.cos(plotx[0, i])
            list_of_lines[0].set_data([0., x_pend], [0., y_pend])
            list_of_lines[1].set_data([x_pend, x_pend], [y_pend, y_pend])
            return list_of_lines

        def init():
            return animate(0)

        ani = animation.FuncAnimation(fig, animate, np.arange(0, len(plotx[0, :])),
                                      interval=use_dt, blit=True, init_func=init)
        plt.close(fig)
        plt.close(ani._fig)
        IPython.display.display_html(IPython.display.HTML(ani.to_html5_video()))
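As a quick sanity check of the dynamics (a sketch of my own, not part of the original post), you can release the pendulum near the bottom with zero control and simulate the passive swing:

# minimal usage sketch (my addition): simulate the uncontrolled pendulum for 5 seconds
robot = Pendulum()
x0 = np.array([0.1, 0.])              # small perturbation from the hanging position
zero_policy = lambda x: 0.            # apply no control
x_traj, u_traj = robot.simulate(x0, zero_policy, 5.)
print(x_traj.shape)                   # (2, 51): angle and velocity at each 0.1s step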
Value iteration
Define the configuration and helper functions
nq = 50
nv = 50
nu = 3
v_max = 6
u_max = 5

# create lookup tables for discretized states
u_table = np.linspace(-u_max, u_max, nu)
q_table = np.linspace(0., 2*np.pi, nq, endpoint=False)
v_table = np.linspace(-v_max, v_max, nv)
num_states = nq * nv

robot = Pendulum()

def get_index(x):
    # index of the grid point closest to the continuous state x
    ind_q = np.argmin((x[0]-q_table)**2)
    ind_v = np.argmin((x[1]-v_table)**2)
    return ind_q + ind_v*nq

def get_states(index):
    # the continuous state (angle, velocity) of a grid index
    iv, ix = np.divmod(index, nq)
    return np.array([q_table[ix], v_table[iv]])

# precompute the transition table: the discretized successor of every (state, control) pair
next_state_index = np.empty([num_states, nu], dtype=np.int32)
for i in range(num_states):
    for k in range(nu):
        x_next = robot.next_state(get_states(i), u_table[k])
        next_state_index[i, k] = get_index(x_next)
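Since get_states always returns exact entries of q_table and v_table, get_index and get_states should be inverses of each other on the grid; a quick round-trip check (my addition) confirms the row-major indexing:

# sanity check (my addition): mapping an index to a state and back is the identity
for j in [0, 17, num_states - 1]:
    assert get_index(get_states(j)) == j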
Define the cost function
def cost(x, u):
    """ a cost function for the inverted pendulum """
    return (x[0]-np.pi)**2 + 0.01*x[1]**2 + 0.0001*u**2
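This cost vanishes only at the upright equilibrium (angle π, zero velocity, zero control) and penalizes deviations quadratically, with the velocity and control terms weighted far less than the angle term. For example (my illustration):

# the goal state costs nothing; hanging straight down costs pi**2 (my addition)
print(cost(np.array([np.pi, 0.]), 0.))   # 0.0
print(cost(np.array([0., 0.]), 0.))      # ~9.87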
Implementing value iteration
class ValueIteration:
    """
    This class is used to implement value iteration and store the state of the
    value function and policy as we iterate
    """

    def __init__(self, model, num_states, cost, discount_factor=0.99):
        """
        receives as input a pendulum and a cost function, and optionally a discount factor
        """
        # value function stored as a 1D array (indexed as we indexed the pendulum states)
        self.num_states = num_states
        self.value_function = np.zeros([self.num_states])
        # we also store the policy similarly
        self.policy = np.zeros([self.num_states])
        # references to the pendulum and cost function
        self.model = model
        self.cost = cost
        # discount factor for the cost
        self.gamma = discount_factor

    def iterate(self, num_states, nu, u_table, num_iter=1):
        """
        the main loop of value iteration
        num_iter: maximum number of iterations to perform. If an iteration leaves the
        value function (almost) unchanged, the function prints a message and returns
        """
        for i in range(num_iter):
            J_new = self.value_function.copy()
            for j in range(num_states):
                # for each possible control input we compute the cost
                r = np.zeros([nu])
                for l in range(nu):
                    # the current state and control
                    x = get_states(j)
                    u = u_table[l]
                    # the index of the next state
                    next_index = next_state_index[j, l]
                    # compute the cost-to-go
                    r[l] = self.cost(x, u) + self.gamma*self.value_function[next_index]
                # we take the smallest cost to update the value function
                J_new[j] = np.min(r)
                # here we also store the greedy policy (so we have it for later)
                self.policy[j] = u_table[np.argmin(r)]
            # we keep iterating while the value function changes, otherwise we are done
            if ((self.value_function - J_new)**2 < 10e-2).all():
                print("CONVERGED after iteration " + str(i))
                break
            else:
                self.value_function = J_new.copy()
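The inner loop is the standard Bellman backup for a discounted-cost problem. Writing $s' = f(s,u)$ for the precomputed successor stored in next_state_index, each sweep computes

$$J_{k+1}(s) = \min_{u}\bigl[c(s,u) + \gamma\, J_k(f(s,u))\bigr], \qquad \pi(s) = \operatorname*{arg\,min}_{u}\bigl[c(s,u) + \gamma\, J_k(f(s,u))\bigr],$$

with discount factor $\gamma = 0.99$. Because $\gamma < 1$ the backup is a contraction, so the sweeps converge to a unique fixed point.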
Training
# we instantiate a value iteration object for a pendulum model and a cost function
value_iteration = ValueIteration(robot, num_states, cost)
# we run the iterations (with a maximum of 2000)
value_iteration.iterate(num_states, nu, u_table, 2000)
Visualization
my_policy = value_iteration.policy

def policy(x):
    # look up the control stored for the grid cell closest to x
    index = get_index(x)
    return my_policy[index]
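Querying the policy at a continuous state just snaps to the nearest grid cell; for instance (my illustration):

# the stored control is always one of the three entries of u_table: -5, 0, or 5 (my addition)
print(policy(np.array([0., 0.])))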
def plot_results(robot, value_function, policy, my_policy, animate=True):
    """
    This function simulates the pendulum from state [0,0] for 20 seconds using the
    given policy and shows an animation of the result
    (value_function and my_policy are accepted for convenience but not used here)
    """
    x0 = np.array([0., 0.])
    x, u = robot.simulate(x0, policy, 20)
    if animate:
        robot.animate_robot(x, robot.delta_t)
plot_results(robot, value_iteration.value_function, policy, my_policy, animate=True)
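To also inspect what value iteration has learned, a minimal heatmap of the value function over the discretized state space could look like this (a sketch of my own; the reshape follows the ind_q + ind_v*nq indexing used above):

# visualize the cost-to-go over the (angle, velocity) grid (my addition)
V = value_iteration.value_function.reshape(nv, nq)   # rows: velocities, columns: angles
plt.imshow(V, origin='lower', aspect='auto',
           extent=[q_table[0], q_table[-1], v_table[0], v_table[-1]])
plt.xlabel('angle [rad]')
plt.ylabel('angular velocity [rad/s]')
plt.colorbar(label='cost-to-go')
plt.show()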
I'll post a video of a run later. That's all for today: this is the hands-on code, and I'll skip the theory here.