from __future__ import division
from collections import defaultdict
import numpy as np
import MDP
We'll consider a capacitated single-sourcing inventory problem with discrete demand and lost sales. The parameters are as follows.
p = 4 # per-unit selling price
h = 1 # per-unit holding cost rate
c = 2 # per-unit ordering cost
inv_cap = 2 # max. number of units on-hand
max_order = 2 # max. number of units that can be ordered
# Assumptions:
## - The demand is discrete and uniformly distributed on {0, 1, 2}
## - The lead-time for orders is 0, i.e., they arrive instantaneously.
def SM(x, a, d):
    # System model: next on-hand inventory after ordering a units and observing demand d.
    # Orders are truncated at the inventory capacity, and unmet demand is lost.
    return max(0, min(inv_cap, x+a) - d)
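For example, if we start with 1 unit on hand, order 2 more, and see demand for 2 units, the order is truncated at the inventory capacity of 2 and we end the period with 0 units; and if we carry nothing and order nothing, any demand is simply lost.
SM(1, 2, 2) # the order is capped at inv_cap, so 2 units are available and a demand of 2 leaves 0
SM(0, 0, 2) # nothing is available, so the demand of 2 is lost and we stay at 0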
Let's set up the MDP model.
mdp_dict = {}
for x in range(inv_cap+1):  # the state x is the current on-hand inventory level
    for a in range(max_order+1):  # the action a is the order quantity
        # Expected one-period reward: expected sales revenue minus expected holding cost minus ordering cost.
        revenue = p * (1/3) * sum([min(d, min(inv_cap, x+a)) for d in range(3)])
        holding_cost = h * (1/3) * sum([max(0, min(inv_cap, x+a)-d) for d in range(3)])
        ordering_cost = c * a
        reward = revenue - holding_cost - ordering_cost
        # Transition probabilities: each demand value in {0, 1, 2} occurs with probability 1/3.
        trans_probs = defaultdict(lambda: 0)
        for d in range(3):
            trans_probs[SM(x, a, d)] += 1/3
        mdp_dict[(x, a)] = (reward, trans_probs)
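As a quick sanity check of the reward calculation, consider state $x=0$ with order quantity $a=2$: the expected revenue is $4 \cdot \tfrac{1}{3}(0+1+2) = 4$, the expected holding cost is $1 \cdot \tfrac{1}{3}(2+1+0) = 1$, and the ordering cost is $2 \cdot 2 = 4$, so the expected reward is $-1$; the next state is uniformly distributed on $\{0, 1, 2\}$.
mdp_dict[(0, 2)]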
mdp = MDP.MDP(mdp_dict)
print('The number of states is {}.'.format(mdp.n))
print('The number of state-action pairs is {}.'.format(mdp.m))
Consider the time horizon $T = 3$.
T = 3
Let's use backward induction to compute an optimal policy.
v = np.zeros((inv_cap+1, 1)) # terminal value function: zero for every state
pi_star = {}
for t in range(1, T+1):
    v = MDP.optimality_operator(v, mdp, 1)    # value-to-go after t backward steps, with discount factor 1
    pi_star[t] = MDP.greedy_policy(v, mdp, 1) # greedy decision rule with respect to v
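As a cross-check, here is a minimal library-free sketch of the same backward induction, computed directly from mdp_dict rather than via the MDP class (all names below are new local variables introduced for illustration).
v_next = {x: 0.0 for x in range(inv_cap+1)} # terminal values: zero reward after the horizon
policy_by_stage = {}
for t in range(1, T+1):
    v_curr, decision_rule = {}, {}
    for x in range(inv_cap+1):
        # one-step lookahead: immediate expected reward plus expected (undiscounted) value-to-go
        q = {a: mdp_dict[(x, a)][0]
                + sum(mdp_dict[(x, a)][1].get(y, 0.0) * v_next[y] for y in range(inv_cap+1))
             for a in range(max_order+1)}
        decision_rule[x] = max(q, key=q.get)
        v_curr[x] = q[decision_rule[x]]
    policy_by_stage[t] = decision_rule
    v_next = v_curr
The stage-by-stage decision rules in policy_by_stage should agree with pi_star above, up to the library's stage-indexing and tie-breaking conventions.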
The optimal policy is a base-stock policy with a base-stock level of $s=1$.
pi_star
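Here, a base-stock (order-up-to) policy with level $s$ orders $\max(0, s-x)$ units (capped at the order limit) when the on-hand inventory is $x$. For reference, a hypothetical helper for building such policies:
def base_stock_policy(s):
    # hypothetical helper: order up to level s, respecting the order limit
    return {x: min(max_order, max(0, s - x)) for x in range(inv_cap+1)}
base_stock_policy(1) # {0: 1, 1: 0, 2: 0}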
Now let's turn to the corresponding infinite-horizon discounted problem, with discount factor $\gamma = 0.9$.
discount_factor = 0.9
First, let's use value iteration to compute an $\epsilon$-optimal policy, with $\epsilon=0.001$.
pi_eps = MDP.value_iteration(mdp, discount_factor, 0.001)
The $\epsilon$-optimal policy is a base-stock policy.
pi_eps
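MDP.value_iteration is used above as a black box. For intuition, here is a hypothetical, library-free sketch of value iteration with the standard stopping rule: iterate the Bellman update until $\lVert v_{k+1} - v_k \rVert_\infty < \epsilon(1-\gamma)/(2\gamma)$, at which point the decision rule attaining the final update is $\epsilon$-optimal. This is only a sketch of the textbook algorithm, not necessarily how the library implements it.
def value_iteration_sketch(gamma, eps):
    # hypothetical, library-free value iteration over mdp_dict
    v = {x: 0.0 for x in range(inv_cap+1)}
    while True:
        q = {(x, a): mdp_dict[(x, a)][0]
                     + gamma * sum(mdp_dict[(x, a)][1].get(y, 0.0) * v[y] for y in range(inv_cap+1))
             for x in range(inv_cap+1) for a in range(max_order+1)}
        v_new = {x: max(q[(x, a)] for a in range(max_order+1)) for x in range(inv_cap+1)}
        if max(abs(v_new[x] - v[x]) for x in range(inv_cap+1)) < eps * (1 - gamma) / (2 * gamma):
            # the decision rule attaining this last update is eps-optimal
            return {x: max(range(max_order+1), key=lambda a: q[(x, a)]) for x in range(inv_cap+1)}
        v = v_new
value_iteration_sketch(discount_factor, 0.001) # should prescribe the same orders as pi_eps, up to tie-breaking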
Next, let's use policy iteration to compute an optimal policy.
pi_star = MDP.policy_iteration(mdp, discount_factor)
v_star = MDP.policy_eval(pi_star, mdp, discount_factor)
It turns out that the $\epsilon$-optimal policy we computed above is optimal.
pi_star
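For reference, evaluating a stationary policy $\pi$ amounts to solving the linear system $v_\pi = r_\pi + \gamma P_\pi v_\pi$, i.e., $(I - \gamma P_\pi) v_\pi = r_\pi$. Here is a hypothetical, library-free sketch (not necessarily how MDP.policy_eval works), assuming the policy is given as a dict mapping each state to an action, as in the hand-built policy below.
def policy_eval_sketch(policy, gamma):
    # hypothetical helper: exact policy evaluation via a linear solve
    n = inv_cap + 1
    r_pi = np.array([mdp_dict[(x, policy[x])][0] for x in range(n)])
    P_pi = np.array([[mdp_dict[(x, policy[x])][1].get(y, 0.0) for y in range(n)] for x in range(n)])
    return np.linalg.solve(np.eye(n) - gamma * P_pi, r_pi)
policy_eval_sketch({0: 1, 1: 0, 2: 0}, discount_factor) # evaluates the base-stock policy with level 1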
Let's try doing one step of policy improvement, starting with a suboptimal base-stock policy. Let $\pi$ denote the base-stock policy with base-stock level $s=2$.
pi = {0: 2, 1: 1, 2: 0} # base-stock level is 2: order up to 2 units in every state
v = MDP.policy_eval(pi, mdp, discount_factor).reshape(3, 1)
print('The optimality gap of the base-stock policy with base-stock level s=2 is {}.'.format(max(abs(v_star-v))[0]))
Now let's do one step of policy iteration, starting with $\pi$.
pi_improved = MDP.greedy_policy(v, mdp, discount_factor)
v_improved = MDP.policy_eval(pi_improved, mdp, discount_factor)
print('The optimality gap after one policy improvement is {}.'.format(max(abs(v_star-v_improved))[0]))
The improved policy is the optimal base-stock policy (with a base-stock level of 1).
pi_improved
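In general, one step of policy improvement replaces $\pi$ with a greedy policy $\pi'(x) \in \arg\max_a \big\{ r(x,a) + \gamma \sum_{x'} p(x' \mid x, a)\, v_\pi(x') \big\}$; the policy improvement theorem guarantees $v_{\pi'} \ge v_\pi$ componentwise, and if $v_{\pi'} = v_\pi$ then $\pi$ is already optimal.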
Note that, under every stationary policy, state 0 can be reached in one step from every state: whatever the post-order inventory level $y = \min(2, x+a)$, demand is at least $y$ with probability at least $1/3$. Consequently, every recurrent class of the induced Markov chain must contain state 0, so the chain has exactly one recurrent class (plus, possibly, transient states). In other words, the MDP is unichain. (The chain need not be irreducible: under the policy that never orders, states 1 and 2 are transient.)
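We can verify the one-step claim numerically: every state-action pair leads to state 0 with positive probability, so the same holds under any stationary policy.
assert all(mdp_dict[(x, a)][1].get(0, 0.0) > 0 for x in range(inv_cap+1) for a in range(max_order+1))
print('Every state-action pair leads to state 0 with positive probability.')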
Let's use policy iteration for unichain MDPs to compute an optimal policy.
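If MDP.unichain_policy_iteration targets the long-run average reward criterion (the usual reason to single out unichain MDPs), then each iteration evaluates the current policy $\pi$ by solving the gain/bias equations $g + h(x) = r(x, \pi(x)) + \sum_{x'} p(x' \mid x, \pi(x))\, h(x')$ (with a normalization such as $h(0) = 0$), and then improves $\pi$ greedily with respect to $h$; an optimal gain $g^*$ and bias $h^*$ satisfy $g^* + h^*(x) = \max_a \big\{ r(x,a) + \sum_{x'} p(x' \mid x, a)\, h^*(x') \big\}$.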
pi_star = MDP.unichain_policy_iteration(mdp)
The optimal policy is also a base-stock policy!
pi_star