论文代码解读 Hierarchical Reinforcement Learning for Scarce Medical Resource Allocation

Posted UQI-LIUWJ

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了论文代码解读 Hierarchical Reinforcement Learning for Scarce Medical Resource Allocation相关的知识,希望对你有一定的参考价值。

论文解读 论文笔记 Hierarchical Reinforcement Learning for Scarce Medical Resource Allocation_UQI-LIUWJ的博客-CSDN博客

代码部分

KYHKL-Q/Hierarchical-RL (github.com) 

1 RL_train.py

训练强化学习模块的部分

由于论文中提及了口罩和床位两个指标,两者的模型在很多方面都是类似的,所以为了方便起见,我只解读口罩部分

1.1 导入库

import numpy as np
import json
import torch
import random
import copy
import os
from torch import nn,optim
from simulator import simulator
from reward_fun import reward_fun

1.2 RL_train 类

1.2.1 __init__

def __init__(self,
                action_lr=0.001,           # learning rate of the saliency-ranking DQN
                Actor_lr=0.001,            # learning rate of the actor (satisfaction factor)
                Critic_lr=0.002,           # learning rate of the critic
                decay_rate=0.5,            # discount factor for future rewards
                batch_size=8,              # mini-batch size for updates
                pool_volume=128,           # replay-buffer capacity
                action_explore_factor=0.5, # epsilon for the DQN's epsilon-greedy exploration
                Actor_explore_factor=0.05, # scale of the actor's Gaussian exploration noise
                soft_replace_rate=0.1,     # Polyak rate: how far each target net moves
                                           # toward its eval net per update (DDPG-style)
                train_steps=150,           # total number of parameter updates (like epochs)
                mask_total=1000000,        # total number of masks available
                interval=48,               # simulated time steps between two decisions
                mask_quality=0.9,          # fraction of virus a medical mask filters
                mask_lasting=48,           # lifetime of a mask (simulated time steps)
                city='city_sample'         # name of the studied region
                ):
        """Initialize the mask-allocation RL trainer.

        Loads the initial epidemic state for *city*, allocates the replay
        buffer, builds the saliency-ranking DQN plus the DDPG actor/critic
        (each with a frozen deep-copied target network), and seeds the
        epidemic simulator with the initial state.
        """
        print('Initializing...')

        self.action_lr = action_lr
        self.Actor_lr = Actor_lr
        self.Critic_lr = Critic_lr
        self.decay_rate = decay_rate
        self.batch_size = batch_size
        self.pool_volume = pool_volume
        self.action_explore_factor = action_explore_factor
        self.Actor_explore_factor = Actor_explore_factor
        self.soft_replace_rate = soft_replace_rate
        self.train_steps = train_steps
        self.interval = interval
        self.mask_total = mask_total
        self.mask_quality = mask_quality
        self.mask_lasting = mask_lasting
        self.city = city

        # Initial state: the file is not shipped with the released code;
        # presumably the per-region population of the 8 epidemic states
        # (S, E, Iu, It, Ia, Ih, R, D), shape region_num * 8 — TODO confirm.
        with open(os.path.join('../data', self.city, 'start.json'), 'r') as f:
            self.start = np.array(json.load(f))

        from Net.Anet import Anet
        # actor network ——> outputs the satisfaction factor

        from Net.Qnet import Qnet
        # action network ——> scores the saliency-ranking schemes

        from Net.RNN import RNN
        # From the three observable states
        #   It (infected and tested), Ih (hospitalized), D (dead)
        # infer the three hidden states
        #   E (exposed), Iu (infected but untested), R (recovered)

        from Net.Cnet import Cnet
        # critic network

        self.region_num = 161
        # Number of regions; supposed to be modified for the studied city.

        # Replay buffer
        self.pool_pointer = 0  # next write position (circular)
        self.pool_count = 0    # number of transitions currently stored
        self.pool_state = np.zeros([self.pool_volume, self.region_num, 8])
        # stored current states: buffer_size * region_num * 8
        self.pool_mask_action = np.zeros(self.pool_volume)
        # which saliency-ranking scheme was chosen
        self.pool_mask_perc = np.zeros(self.pool_volume)
        # the mask satisfaction factor
        self.pool_reward = np.zeros(self.pool_volume)
        # single-step rewards
        self.pool_state1 = np.zeros([self.pool_volume, self.region_num, 8])
        # stored next states

        self.current_state = self.start
        self.next_state = self.current_state

        # Initialize the networks
        torch.manual_seed(1234)
        torch.cuda.manual_seed(1234)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # DQN pair: the target is a deep copy (no shared parameters) and is
        # kept in eval mode — it is never optimized directly.
        self.Mask_action_eval = Qnet().to(self.device)
        self.Mask_action_eval.train()
        self.Mask_action_target = copy.deepcopy(self.Mask_action_eval)
        self.Mask_action_target.eval()
        # Fix: this line had lost its indentation in the original.
        self.Mask_action_optimizer = optim.Adam(self.Mask_action_eval.parameters(), lr=self.action_lr)

        # DDPG actor pair
        self.Mask_Actor_eval = Anet().to(self.device)
        self.Mask_Actor_eval.train()
        self.Mask_Actor_target = copy.deepcopy(self.Mask_Actor_eval)
        self.Mask_Actor_target.eval()
        self.Mask_Actor_optimizer = optim.Adam(self.Mask_Actor_eval.parameters(), lr=self.Actor_lr)

        # DDPG critic pair
        self.Mask_Critic_eval = Cnet(self.region_num).to(self.device)
        self.Mask_Critic_eval.train()
        self.Mask_Critic_target = copy.deepcopy(self.Mask_Critic_eval)
        # Fix: the original called eval() on Mask_Critic_eval (the net being
        # trained); the frozen copy is the target, matching the other pairs.
        self.Mask_Critic_target.eval()
        self.Mask_Critic_optimizer = optim.Adam(self.Mask_Critic_eval.parameters(), lr=self.Critic_lr)

        # Epidemic simulator: per-region disease-state transitions.
        self.simulator = simulator(city=self.city)

        self.simulator.reset(self.start)
        # Copies the initial per-region counts into the simulator's node list;
        # each node carries an id and the 8 states (S, E, Iu, It, Ia, Ih, R, D).
        print('Initializing down!')

1.2.2 train

def train(self):
        """Run the hierarchical RL training loop for mask allocation.

        Each outer iteration starts an episode from the initial epidemic
        state. At every decision step the DQN picks a saliency-ranking
        scheme and the DDPG actor outputs a satisfaction factor; the
        simulator advances, transitions are buffered while hospitals are
        saturated, and mini-batches update the DQN, critic and actor,
        followed by a soft update of the three target networks. Trained
        eval networks are saved to ../model/<city>/ at the end.
        """
        print('Start training...')
        loss_record = list()   # [TD loss, policy-gradient loss] per update
        train_count = 0        # number of parameter updates performed so far

        while train_count < self.train_steps:
            # --- start a new episode from the initial state ---
            # (start.json is not shipped; presumably the region_num * 8
            # initial per-region state counts — TODO confirm)
            self.current_state = self.start
            self.next_state = self.current_state
            # Fix: re-seed the simulator so its internal node states match
            # current_state at the start of every episode.
            self.simulator.reset(self.start)

            self.is_end = False   # episode-termination flag
            end_count = 0         # becomes 1 once beds have been full at least once

            # --- sampling ---
            step_count = 0
            while ((not self.is_end) and
                   step_count <= 60 / (self.interval / 48)):

                # Observable channels E, Iu, It, Ih, R, D -> [1, region_num, 6]
                obs = torch.FloatTensor(
                    self.current_state[:, [1, 2, 3, 5, 6, 7]]).to(self.device).unsqueeze(0)

                # DQN scores the 7 ranking schemes (3 principles, any
                # non-empty combination: 2^3 - 1 = 7) -> [1, 7]
                mask_action_out = self.Mask_action_eval(obs)

                # epsilon-greedy selection of a ranking scheme
                mask_action = torch.argmax(mask_action_out).cpu().item()
                if random.random() > (1 - self.action_explore_factor):
                    mask_action = int(7 * random.random())
                # Fix: the original format strings had no {} placeholders.
                print('mask_action:{}'.format(mask_action))

                # Actor outputs the satisfaction factor f in (0, 1); add
                # Gaussian exploration noise, then clip into [0.1, 1].
                mask_perc = self.Mask_Actor_eval(obs)
                mask_perc = (mask_perc.squeeze(0).cpu().detach().item()
                             + self.Actor_explore_factor * np.random.randn())
                mask_perc_clip = np.clip(mask_perc, 0.1, 1)
                print('mask_perc:{}'.format(mask_perc_clip))

                # Advance the simulator one decision interval.
                # Fix: the bed-related arguments (self.bed_total, bed_action,
                # bed_perc_clip) were never defined — only the mask policy is
                # implemented here — and simulate() takes no such parameters.
                self.next_state, _ = self.simulator.simulate(
                    sim_type='Policy_a',
                    interval=self.interval,
                    mask_on=True,
                    mask_total=self.mask_total,
                    mask_quality=self.mask_quality,
                    mask_lasting=self.mask_lasting,
                    mask_action=mask_action,
                    mask_satisfy_perc=mask_perc_clip)

                # Episode ends the first time hospitals stop being full
                # after having been full once (simulator.full flag).
                if self.simulator.full and end_count == 0:
                    end_count = 1
                if (not self.simulator.full) and end_count == 1:
                    self.is_end = True
                print('Is_end:{}'.format(self.is_end))

                # single-step reward
                reward = reward_fun(self.current_state, self.next_state)
                print('Reward:{}'.format(reward))

                # Store the transition only while the hospital system is
                # saturated — the phase the policy is trained for.
                if self.simulator.full:
                    self.pool_state[self.pool_pointer] = self.current_state
                    self.pool_mask_action[self.pool_pointer] = mask_action
                    self.pool_mask_perc[self.pool_pointer] = mask_perc
                    self.pool_reward[self.pool_pointer] = reward
                    self.pool_state1[self.pool_pointer] = self.next_state
                    self.pool_pointer = (self.pool_pointer + 1) % self.pool_volume
                    if self.pool_count < self.pool_volume:
                        self.pool_count = self.pool_count + 1
                    print('Sampling: {} samples in pool now'.format(self.pool_count))

                # --- mini-batch sampling and parameter updates ---
                if self.pool_count >= self.batch_size:
                    sample_index = random.sample(range(self.pool_count), self.batch_size)
                    sample_state = torch.FloatTensor(self.pool_state[sample_index]).to(self.device)
                    sample_mask_action = torch.LongTensor(
                        self.pool_mask_action[sample_index].astype(int)).to(self.device)
                    sample_reward = torch.FloatTensor(self.pool_reward[sample_index]).to(self.device)
                    sample_mask_perc = torch.FloatTensor(
                        self.pool_mask_perc[sample_index]).to(self.device).unsqueeze(1)
                    sample_state1 = torch.FloatTensor(self.pool_state1[sample_index]).to(self.device)

                    obs_batch = sample_state[:, :, [1, 2, 3, 5, 6, 7]]    # [B, region_num, 6]
                    obs_batch1 = sample_state1[:, :, [1, 2, 3, 5, 6, 7]]  # [B, region_num, 6]

                    mask_action_eval = self.Mask_action_eval(obs_batch)                 # [B, 7]
                    mask_reward_eval = self.Mask_Critic_eval(obs_batch, sample_mask_perc)  # [B, 1]

                    # Target values carry no gradient (their parameters are
                    # only moved via the soft update below).
                    with torch.no_grad():
                        mask_action_target = self.Mask_action_target(obs_batch1)  # [B, 7]
                        mask_perc_target = self.Mask_Actor_target(obs_batch1)     # [B, 1]
                        mask_reward_target = self.Mask_Critic_target(obs_batch1, mask_perc_target)  # [B, 1]

                    # y = r + gamma * (max_a' Q_target(s', a') + C_target(s', f'))
                    best_next_q = mask_action_target.max(dim=-1, keepdim=True)[0]  # [B, 1]
                    y = sample_reward.unsqueeze(1) + self.decay_rate * (best_next_q + mask_reward_target)

                    # MSE between the combined eval estimate (critic value +
                    # Q of the action actually taken) and the target y.
                    chosen_q = mask_action_eval.gather(1, sample_mask_action.unsqueeze(1))  # [B, 1]
                    loss = torch.mean((mask_reward_eval + chosen_q - y) ** 2)

                    print('Loss:{}'.format(loss.cpu().item()))
                    self.Mask_action_optimizer.zero_grad()
                    self.Mask_Critic_optimizer.zero_grad()
                    loss.backward()
                    self.Mask_action_optimizer.step()
                    self.Mask_Critic_optimizer.step()
                    # trains the DQN (action) and the critic

                    # DDPG actor update: gradient *ascent* on the critic's
                    # value of the actor's own output — hence the minus sign.
                    mask_perc_eval = self.Mask_Actor_eval(obs_batch)
                    mask_reward0 = self.Mask_Critic_eval(obs_batch, mask_perc_eval)
                    loss_pg = -torch.mean(mask_reward0)

                    print('Loss_pg:{}'.format(loss_pg.cpu().item()))
                    self.Mask_Actor_optimizer.zero_grad()
                    loss_pg.backward()
                    self.Mask_Actor_optimizer.step()
                    # trains the actor

                    loss_record.append([loss.cpu().item(), loss_pg.cpu().item()])

                    # Soft (Polyak) update of all three target networks:
                    # target = (1 - rate) * target + rate * eval.
                    # (Replaces the original eval()-string hack.)
                    self._soft_update(self.Mask_action_target, self.Mask_action_eval)
                    self._soft_update(self.Mask_Actor_target, self.Mask_Actor_eval)
                    self._soft_update(self.Mask_Critic_target, self.Mask_Critic_eval)

                    train_count += 1
                    print('Training:epoch {}/{}\n'.format(train_count, self.train_steps))

                    if train_count == self.train_steps:
                        break

                # advance the episode
                self.current_state = self.next_state
                step_count += 1

        # save the trained eval networks
        torch.save(self.Mask_action_eval.state_dict(),
                   os.path.join('../model', self.city, 'mask_action_model.pth'))
        torch.save(self.Mask_Actor_eval.state_dict(),
                   os.path.join('../model', self.city, 'mask_Actor_model.pth'))

        '''
        with open(os.path.join('../model',self.city,'loss.json'),'w') as f:
            json.dump(loss_record,f)
        '''
        print('Training complete!')

def _soft_update(self, target_net, eval_net):
        """In-place Polyak averaging: target <- (1 - rho)*target + rho*eval."""
        with torch.no_grad():
            for t_param, e_param in zip(target_net.parameters(), eval_net.parameters()):
                t_param.data.mul_(1.0 - self.soft_replace_rate)
                t_param.data.add_(self.soft_replace_rate * e_param.data)

1.3 主函数部分

if __name__ == "__main__":
    # Run from the script's own directory so the relative ../data and
    # ../model paths resolve regardless of the launch directory.
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    # Train the mask-allocation policy with the full mask budget.
    trainer = RL_train(mask_total=1000000)
    trainer.train()

2 Net/Qnet.py

action 部分 

计算显著性排序的DQN

import torch
from torch import nn,optim
import torch.nn.functional as F

class Qnet(nn.Module):
    """DQN scoring the saliency-ranking schemes.

    Input: [batch, region_num, 6] observable per-region states, ordered
    E, Iu, It, Ih, R, D (E exposed; Iu infected, untested; It infected and
    tested; Ih hospitalized; R recovered; D dead).
    Output: [batch, 7] — one score per non-empty combination of the three
    ranking principles (2^3 - 1 = 7).
    """

    def __init__(self):
        super(Qnet, self).__init__()
        # Four 1-D convolutions over the region axis: 6 -> 16 -> 32 -> 16 -> 4 channels.
        self.cov1 = nn.Conv1d(6, 16, kernel_size=5, stride=1, padding=2, bias=True)
        self.cov2 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2, bias=True)
        self.cov3 = nn.Conv1d(32, 16, kernel_size=5, stride=2, padding=0, bias=True)
        self.cov4 = nn.Conv1d(16, 4, kernel_size=5, stride=2, padding=0, bias=True)
        # Head: flattened conv features -> 7 ranking-scheme scores.
        # NOTE(review): in_features=664 = 4 * conv-output length and is tied
        # to a specific region count — confirm against region_num.
        self.lin1 = nn.Linear(in_features=664, out_features=128, bias=True)
        self.lin2 = nn.Linear(in_features=128, out_features=7, bias=True)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Diagonal per-channel scaling of the six input states.
        self.mask = torch.tensor([[0.01, 0, 0, 0, 0, 0],
                                  [0, 0.05, 0, 0, 0, 0],
                                  [0, 0, 0.01, 0, 0, 0],
                                  [0, 0, 0, 0.05, 0, 0],
                                  [0, 0, 0, 0, 0.01, 0],
                                  [0, 0, 0, 0, 0, 0.05]], dtype=torch.float).to(self.device)

    def forward(self, x):
        # Scale each state channel, then move channels first: [B, 6, region_num].
        features = torch.matmul(x, self.mask).permute(0, 2, 1)
        # Conv stack with leaky-ReLU (slope 0.2) after every layer.
        for conv in (self.cov1, self.cov2, self.cov3, self.cov4):
            features = F.leaky_relu(conv(features), 0.2)
        # Flatten to [B, 664] and map through the two linear layers.
        hidden = F.leaky_relu(self.lin1(features.view(x.shape[0], -1)), 0.2)
        # [B, 7]: scores for the 7 ranking-principle combinations.
        return self.lin2(hidden)

3 Net/Anet.py

actor 部分

import torch
from torch import nn,optim
import torch.nn.functional as F

class Anet(nn.Module):
    """DDPG actor producing the mask satisfaction factor.

    Input: [batch, region_num, 6] observable per-region states, ordered
    E, Iu, It, Ih, R, D.
    Output: [batch, 1] — a sigmoid-squashed value in [0, 1], the
    satisfaction factor f from the paper.
    """

    def __init__(self):
        super(Anet, self).__init__()
        # Four 1-D convolutions over the region axis: 6 -> 16 -> 32 -> 16 -> 4 channels.
        self.cov1 = nn.Conv1d(6, 16, kernel_size=5, stride=1, padding=2, bias=True)
        self.cov2 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2, bias=True)
        self.cov3 = nn.Conv1d(32, 16, kernel_size=5, stride=2, padding=0, bias=True)
        self.cov4 = nn.Conv1d(16, 4, kernel_size=5, stride=2, padding=0, bias=True)
        # Head: flattened conv features -> scalar factor.
        # NOTE(review): in_features=664 is tied to a specific region count —
        # confirm against region_num.
        self.lin1 = nn.Linear(in_features=664, out_features=128, bias=True)
        self.lin2 = nn.Linear(in_features=128, out_features=16, bias=True)
        self.lin3 = nn.Linear(in_features=16, out_features=1, bias=True)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Diagonal per-channel scaling of the six input states.
        self.mask = torch.tensor([[0.01, 0, 0, 0, 0, 0],
                                  [0, 0.05, 0, 0, 0, 0],
                                  [0, 0, 0.01, 0, 0, 0],
                                  [0, 0, 0, 0.05, 0, 0],
                                  [0, 0, 0, 0, 0.01, 0],
                                  [0, 0, 0, 0, 0, 0.05]], dtype=torch.float).to(self.device)

    def forward(self, state):
        # Scale each state channel, then move channels first: [B, 6, region_num].
        features = torch.matmul(state, self.mask).permute(0, 2, 1)
        # Conv stack with leaky-ReLU (slope 0.2) after every layer.
        for conv in (self.cov1, self.cov2, self.cov3, self.cov4):
            features = F.leaky_relu(conv(features), 0.2)
        # Flatten to [B, 664], two hidden linear layers, sigmoid output.
        hidden = F.leaky_relu(self.lin1(features.view(state.shape[0], -1)), 0.2)
        hidden = F.leaky_relu(self.lin2(hidden), 0.2)
        # [B, 1]: the satisfaction factor, squashed into [0, 1].
        return torch.sigmoid(self.lin3(hidden))

4 Net/Cnet.py

critic 网络

import torch
from torch import nn,optim
import torch.nn.functional as F

class Cnet(nn.Module):
    """DDPG critic: values a (state, satisfaction-factor) pair.

    Inputs: state [batch, region_num, 6] (observable channels E, Iu, It,
    Ih, R, D) and perc [batch, 1] (the satisfaction factor).
    Output: [batch, 1] scalar value estimate.
    """

    def __init__(self, region_num):
        super(Cnet, self).__init__()
        # Expand the scalar factor into one value per region: 1 -> 128 -> 256 -> region_num.
        self.lin_1 = nn.Linear(in_features=1, out_features=128, bias=True)
        self.lin_2 = nn.Linear(in_features=128, out_features=256, bias=True)
        self.lin_3 = nn.Linear(in_features=256, out_features=region_num, bias=True)

        # Conv stack over the region axis; input has 7 channels
        # (6 scaled state channels + 1 factor channel): 7 -> 16 -> 32 -> 16 -> 4.
        self.cov1 = nn.Conv1d(7, 16, kernel_size=5, stride=1, padding=2, bias=True)
        self.cov2 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2, bias=True)
        self.cov3 = nn.Conv1d(32, 16, kernel_size=5, stride=2, padding=0, bias=True)
        self.cov4 = nn.Conv1d(16, 4, kernel_size=5, stride=2, padding=0, bias=True)

        # Head: flattened conv features -> scalar value.
        # NOTE(review): in_features=664 = 4 * conv-output length and is tied
        # to a specific region_num — confirm against the constructor argument.
        self.lin1 = nn.Linear(in_features=664, out_features=128, bias=True)
        self.lin2 = nn.Linear(in_features=128, out_features=16, bias=True)
        self.lin3 = nn.Linear(in_features=16, out_features=1, bias=True)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Diagonal per-channel scaling of the six input states.
        self.mask = torch.tensor([[0.01, 0, 0, 0, 0, 0],
                                  [0, 0.05, 0, 0, 0, 0],
                                  [0, 0, 0.01, 0, 0, 0],
                                  [0, 0, 0, 0.05, 0, 0],
                                  [0, 0, 0, 0, 0.01, 0],
                                  [0, 0, 0, 0, 0, 0.05]], dtype=torch.float).to(self.device)

    def forward(self, state, perc):
        # Lift the factor to a per-region channel: [B, 1] -> [B, region_num].
        factor = F.leaky_relu(self.lin_1(perc), 0.2)
        factor = F.leaky_relu(self.lin_2(factor), 0.2)
        factor = F.leaky_relu(self.lin_3(factor), 0.2)

        # Concatenate scaled states with the factor channel, then move
        # channels first: [B, region_num, 7] -> [B, 7, region_num].
        features = torch.cat((torch.matmul(state, self.mask), factor.unsqueeze(2)),
                             dim=-1).permute(0, 2, 1)

        # Conv stack with leaky-ReLU (slope 0.2) after every layer.
        for conv in (self.cov1, self.cov2, self.cov3, self.cov4):
            features = F.leaky_relu(conv(features), 0.2)

        # Flatten to [B, 664] and map down to the scalar value.
        value = F.leaky_relu(self.lin1(features.view(state.shape[0], -1)), 0.2)
        value = F.leaky_relu(self.lin2(value), 0.2)
        # [B, 1]: the critic's value estimate (no final activation).
        return self.lin3(value)

5 simulator.py

import numpy as np
import json
import random
import os
from node import node

5.1 __init__

class simulator(object):
    """Epidemic simulator for one city.

    Loads per-region dynamic/population data from ../data/<city>/ and keeps
    one node per region holding the 8 epidemic-state counts (filled by
    reset()).
    """

    def __init__(self,city='city_sample'):
        super(simulator, self).__init__()
        # Name of the studied city; also the data sub-directory name.
        self.city = city

        #load data
        print('Loading dynamic pattern...')
        with open(os.path.join('../data',self.city,'prob.json'), 'r') as f:
            self.prob = np.array(json.load(f))
        # File not shipped with the released code; presumably per-region
        # state-transition probabilities — TODO confirm.

        print('Down!')
        print('Loading population data...')
        with open(os.path.join('../data',self.city,'flow.json'), 'r') as f:
            self.flow = np.array(json.load(f))
        # File not shipped; presumably inter-region population flow — TODO confirm.

        with open(os.path.join('../data',self.city,'dense.json'), 'r') as f:
            self.dense = np.array(json.load(f))
        # File not shipped; presumably per-region population density — TODO confirm.

        with open(os.path.join('../data',self.city,'pop_region.json'), 'r') as f:
            self.pop = np.array(json.load(f))
        # File not shipped; presumably the per-region allocation cap
        # (population upper bound) — TODO confirm.
        print('Down!')

        self.START = 1
        # Number of regions, inferred from the flow data.
        self.region_num = len(self.flow)

        # One node per region (populated by reset()); `full` tracks whether
        # hospital beds are currently saturated.
        self.nodes = list()
        self.full = False

        self.parameters = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1], [0, 1, 1], [1, 1, 1]]
        # The 7 non-empty combinations of the three ranking principles
        # (1 = principle used, 0 = not used).

5.2 reset

def reset(self, state):
        """Rebuild the per-region node list from an initial state matrix.

        *state* is a region_num * 8 array of per-region population counts in
        the order S, E, Iu (infected, untested), It (infected, tested),
        Ia (asymptomatic), Ih (hospitalized), R, D.
        """
        print('\\nReseting...')
        # Setter for each of the 8 state columns, in column order.
        setters = ['set_susceptible',            # S
                   'set_latent',                 # E
                   'set_infected_ut',            # Iu
                   'set_infected_t',             # It
                   'set_infected_asymptomatic',  # Ia
                   'set_in_hospital',            # Ih
                   'set_recovered',              # R
                   'set_death']                  # D
        self.nodes = list()
        for region_id in range(self.region_num):
            fresh = node(region_id)  # all 8 states start at 0
            self.nodes.append(fresh)
            for col, setter in enumerate(setters):
                getattr(fresh, setter)(state[region_id][col])
        print('Down!')

5.3 simulate

各状态转移量模拟

def simulate(self,
             sim_type,                   # mask-allocation scheme: 'Mean', 'Lottery', 'Max_first', 'Min_first', 'Policy', 'Policy_a'
             interval,                   # number of 30-minute steps to advance
             mask_on=False,              # whether masks are distributed at all
             mask_total=0,               # total mask stock per distribution round
             mask_quality=0,             # fraction of transmissions a mask blocks (γ)
             mask_lasting=48,            # mask lifetime, in 30-minute steps
             mask_action=0,              # index into self.parameters selecting the ranking criteria
             mask_distribute_perc=None,  # per-rank satisfaction factors (used by 'Policy'); defaults to [0]
             bed_satisfy_perc=1,         # unused here — presumably for the bed-allocation variant; TODO confirm
             mask_satisfy_perc=1,        # uniform satisfaction factor (used by 'Policy_a')
             it_move=False,              # whether tested-infected (It) people may travel between regions
             is_save=False,              # whether to dump per-step region states to ../result
             save_time=0):               # offset used when naming saved result files
    """Advance the epidemic simulation by `interval` 30-minute steps.

    Each step: (1) if the current masks have expired (every `mask_lasting`
    steps), redistribute `mask_total` masks across regions according to
    `sim_type`; (2) run intra-region compartment transitions via
    node.step(); (3) move people between regions with multinomial draws
    from the weekly mobility pattern `self.prob`.

    Returns:
        (next_state, Ih_new) — next_state is a region_num x 8 ndarray of
        compartment counts (S, E, Iu, It, Ia, Ih, R, D) per region.
    """
    # Avoid the shared-mutable-default pitfall (original default was [0]).
    if mask_distribute_perc is None:
        mask_distribute_perc = [0]

    print('Simulating...')
    # Movement matrices: row k is the multinomial split of region k's
    # compartment population over destination regions; the extra last
    # column (index region_num) holds the people who stay put.
    S_temp = np.zeros((self.region_num, self.region_num + 1))
    L_temp = np.zeros((self.region_num, self.region_num + 1))
    Iut_temp = np.zeros((self.region_num, self.region_num + 1))
    if it_move:
        It_temp = np.zeros((self.region_num, self.region_num + 1))
    Ia_temp = np.zeros((self.region_num, self.region_num + 1))
    R_temp = np.zeros((self.region_num, self.region_num + 1))

    # NOTE(review): Ih_new is returned but never incremented in this
    # (mask-only) variant — confirm against the bed-allocation code.
    Ih_new = 0
    for time in range(interval):
        # ---- mask redistribution (masks expire every mask_lasting steps) ----
        if (time % mask_lasting == 0):
            mask_numbers = np.zeros(self.region_num)  # masks allocated to each region

            if mask_on:
                mask_num = mask_total

                if sim_type == 'Mean':
                    # Split the stock evenly over all regions.
                    mask_numbers = np.ones(self.region_num) * int(mask_num / self.region_num)

                elif sim_type == 'Lottery':
                    # Shuffle region ids and satisfy them greedily in that
                    # random order (earlier regions are filled up first).
                    order = np.array(range(self.region_num))
                    np.random.shuffle(order)
                    for i in range(self.region_num):
                        if mask_num > 0:
                            # self.pop[order[i]] caps the region's allocation
                            mask_numbers[order[i]] = min(mask_num, self.pop[order[i]])
                            mask_num = mask_num - mask_numbers[order[i]]
                        else:
                            break

                else:
                    patient_num = np.zeros(self.region_num)
                    patient_num_order = np.zeros(self.region_num)
                    for i in range(self.region_num):
                        patient_num[i] = self.nodes[i].infected_t
                        # Cumulative infections per region:
                        # tested infected (It) + hospitalized (Ih)
                        # + recovered (R) + dead (D).
                        patient_num_order[i] = self.nodes[i].infected_t + self.nodes[i].in_hospital + self.nodes[i].recovered + self.nodes[i].death

                    if sim_type == 'Max_first':
                        # Regions with the most cumulative infections first.
                        order = np.argsort(-patient_num_order)
                        for i in range(self.region_num):
                            if mask_num > 0:
                                mask_numbers[order[i]] = min(mask_num, self.pop[order[i]])
                                mask_num = mask_num - mask_numbers[order[i]]
                            else:
                                break

                    elif sim_type == 'Min_first':
                        # Regions with the fewest cumulative infections first.
                        order = np.argsort(patient_num_order)
                        for i in range(self.region_num):
                            if mask_num > 0:
                                mask_numbers[order[i]] = min(mask_num, self.pop[order[i]])
                                mask_num = mask_num - mask_numbers[order[i]]
                            else:
                                break

                    elif sim_type == 'Policy':
                        # RL policy: rank regions by a weighted mix of
                        # normalized infections / flow / density (weights
                        # picked via mask_action), then distribute by the
                        # actor-provided per-rank satisfaction factors.
                        mask_parameter = self.parameters[mask_action]
                        infect_num = patient_num_order/np.mean(patient_num_order)
                        order = np.argsort(-(mask_parameter[0]*infect_num
                                             + mask_parameter[1]*self.flow
                                             + mask_parameter[2]*self.dense))
                        # Masks granted to each ranked slot.
                        alloc_mask = np.floor(mask_num*np.array(mask_distribute_perc))

                        for i in range(len(mask_distribute_perc)):
                            mask_numbers[order[i]] = min(self.pop[order[i]], alloc_mask[i])
                            mask_num -= mask_numbers[order[i]]

                        if mask_num > 0:
                            # Leftover stock: top up ranked regions in
                            # decreasing order of their satisfaction factor.
                            temp_order = np.argsort(-np.array(mask_distribute_perc))
                            for i in range(len(mask_distribute_perc)):
                                temp = min(mask_num,
                                           self.pop[order[temp_order[i]]]-mask_numbers[order[temp_order[i]]])
                                mask_num = mask_num - temp
                                mask_numbers[order[temp_order[i]]] += temp
                                if mask_num == 0:
                                    break

                            # Any remaining masks go one-by-one to random
                            # regions that still have spare capacity.
                            while mask_num > 0:
                                index = np.random.randint(0, self.region_num)
                                if mask_numbers[index] < self.pop[index]:
                                    mask_numbers[index] += 1
                                    mask_num -= 1
                                else:
                                    break

                    elif sim_type == 'Policy_a':
                        # Ablation: same ranking as 'Policy' but one uniform
                        # satisfaction factor applied to every region.
                        mask_parameter = self.parameters[mask_action]
                        infect_num = patient_num_order/np.mean(patient_num_order)
                        order = np.argsort(-(mask_parameter[0] * infect_num
                                             + mask_parameter[1] * self.flow
                                             + mask_parameter[2] * self.dense))
                        for i in range(self.region_num):
                            if mask_num > 0:
                                mask_numbers[order[i]] = min(mask_num, int(self.pop[order[i]]*mask_satisfy_perc))
                                mask_num = mask_num - mask_numbers[order[i]]
                            else:
                                break

        # ---- intra-region state transitions ----
        for i in range(self.region_num):
            self.nodes[i].step(mask_numbers[i], mask_quality)

        # ---- cross-region traveling ----
        # self.prob indexes a weekly (7*48-step) mobility pattern.
        for k in range(self.region_num):
            S_temp[k] = np.random.multinomial(
                self.nodes[k].susceptible, self.prob[((self.START-1)*48+time) % (7*48)][k])
            L_temp[k] = np.random.multinomial(
                self.nodes[k].latent, self.prob[((self.START-1)*48+time) % (7*48)][k])
            # Untested infected (Iu) travel without restriction.
            Iut_temp[k] = np.random.multinomial(
                self.nodes[k].infected_ut, self.prob[((self.START - 1)* 48 + time) % (7* 48)][k])
            if it_move:
                # Tested infected (It) travel only when it_move is set.
                It_temp[k] = np.random.multinomial(
                    self.nodes[k].infected_t, self.prob[((self.START-1)*48+time) % (7*48)][k])
            Ia_temp[k] = np.random.multinomial(
                self.nodes[k].infected_asymptomatic, self.prob[((self.START - 1)* 48 + time) % (7* 48)][k])
            R_temp[k] = np.random.multinomial(
                self.nodes[k].recovered, self.prob[((self.START - 1)* 48 + time) % (7* 48)][k])

        # Arrivals into each region (column sums)...
        S_temp_sum0 = np.sum(S_temp, axis=0)
        L_temp_sum0 = np.sum(L_temp, axis=0)
        Iut_temp_sum0 = np.sum(Iut_temp, axis=0)
        if it_move:
            It_temp_sum0 = np.sum(It_temp, axis=0)
        Ia_temp_sum0 = np.sum(Ia_temp, axis=0)
        R_temp_sum0 = np.sum(R_temp, axis=0)

        # ...and departures from each region (row sums; these include the
        # stay-at-home column).
        S_temp_sum1 = np.sum(S_temp, axis=1)
        L_temp_sum1 = np.sum(L_temp, axis=1)
        Iut_temp_sum1 = np.sum(Iut_temp, axis=1)
        if it_move:
            It_temp_sum1 = np.sum(It_temp, axis=1)
        Ia_temp_sum1 = np.sum(Ia_temp, axis=1)
        R_temp_sum1 = np.sum(R_temp, axis=1)

        for k in range(self.region_num):
            # new = old + arrivals - departures; the row sum counts the
            # stay-at-home column as a departure, so add it back once.
            self.nodes[k].set_susceptible(
                self.nodes[k].susceptible
                + S_temp_sum0[k]
                - S_temp_sum1[k]
                + S_temp[k][self.region_num])
            self.nodes[k].set_latent(
                self.nodes[k].latent
                + L_temp_sum0[k]
                - L_temp_sum1[k]
                + L_temp[k][self.region_num])
            self.nodes[k].set_infected_ut(
                self.nodes[k].infected_ut
                + Iut_temp_sum0[k]
                - Iut_temp_sum1[k]
                + Iut_temp[k][self.region_num])
            if it_move:
                self.nodes[k].set_infected_t(
                    self.nodes[k].infected_t
                    + It_temp_sum0[k]
                    - It_temp_sum1[k]
                    + It_temp[k][self.region_num])
            self.nodes[k].set_infected_asymptomatic(
                self.nodes[k].infected_asymptomatic
                + Ia_temp_sum0[k]
                - Ia_temp_sum1[k]
                + Ia_temp[k][self.region_num])
            self.nodes[k].set_recovered(
                self.nodes[k].recovered
                + R_temp_sum0[k]
                - R_temp_sum1[k]
                + R_temp[k][self.region_num])

        if(is_save):
            # Dump the full region x compartment matrix for this step.
            save = list()
            for i in range(self.region_num):
                temp1 = [self.nodes[i].susceptible, self.nodes[i].latent, self.nodes[i].infected_ut, self.nodes[i].infected_t, self.nodes[i].infected_asymptomatic, self.nodes[i].in_hospital, self.nodes[i].recovered, self.nodes[i].death]
                save.append(temp1)
            save = np.array(save)
            # Fix: np.float was removed in NumPy 1.24 — use the builtin float.
            save = save.astype(float)
            with open(os.path.join('../result',self.city,'result_'+str(save_time*interval+time)+'.json'), 'w') as f:
                json.dump(save.tolist(), f)

    # Snapshot the final per-region compartment counts.
    next_state = list()
    for i in range(self.region_num):
        next_state.append([self.nodes[i].susceptible,
                           self.nodes[i].latent,
                           self.nodes[i].infected_ut,
                           self.nodes[i].infected_t,
                           self.nodes[i].infected_asymptomatic,
                           self.nodes[i].in_hospital,
                           self.nodes[i].recovered,
                           self.nodes[i].death])

    a = self.statistic()
    # Fix: the original format strings had lost their '{}' placeholders and
    # printed only the literal labels.
    print('S:{},L:{},Iut:{},It:{},Ia:{},Ih:{},R:{},D:{}'.format(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]))
    print('Is Full:{}'.format(self.full))

    return np.array(next_state), Ih_new

6 node.py

import numpy as np
# Epidemiological constants for node.step(); one simulation tick = 30 minutes.
Pa = 0.018  # false negative rate of testing (share of infections that go to the asymptomatic compartment)
eps_1 = 250  # average incubation length, in 30-min steps
eps_1_d = 5.2  # average incubation length, in days
d = 0.15  # death rate outside hospital
t = 672  # average recovery time, in 30-min steps
t_d = 14  # average recovery time, in days
d_hospital = 0.04  # death rate in hospital
t_hospital = 614  # average recovery time in hospital, in 30-min steps
t_hospital_d = 12.8  # average recovery time in hospital, in days
R_0 = 2.68  # basic reproduction number
r_a = 0.6  # relative infectiousness of asymptomatic cases
r_L = 1.0  # relative infectiousness of latent cases
eps = 1/eps_1  # per-step rate of leaving the latent (E) state
# Daily transmission rate derived from R_0 and the infectious durations,
# then converted from per-day to per-30-min (compound-rate conversion).
beta = R_0 / (r_L * eps_1_d + (Pa * r_a + (1 - Pa)) * t_d)
beta = np.power(1+beta, 1/48)-1
L_I = eps * (1 - Pa)  # per-step E -> Iu rate (true positives)
L_Ia = eps * Pa  # per-step E -> Ia rate (false negatives)
I_D = d / t  # per-step death rate for untreated infections
# NOTE(review): denominator (t_d - d) subtracts a rate from a duration in
# days; possibly t_d*(1-d) was intended — confirm against the paper.
I_R = ((1 - d)/(t_d - d))/48  # per-step recovery rate for untreated infections

theta = 48  # average time from infection to testing, in 30-min steps
I_h = 1/theta  # per-step Iu -> It (testing) rate
Ia_R = 1 / t  # per-step recovery rate for asymptomatic cases
Ih_D = d_hospital / t_hospital  # per-step in-hospital death rate
Ih_R = ((1 - d_hospital) / (t_hospital_d - d_hospital))/48  # per-step in-hospital recovery rate

class node:
    """One region in the epidemic simulator.

    Tracks eight compartments: susceptible (S), latent (E), infected
    untested (Iu), infected tested (It), infected asymptomatic (Ia),
    in-hospital (Ih), recovered (R) and dead (D). step() advances the
    intra-region transitions by one 30-minute tick using the module-level
    epidemiological constants (beta, eps, I_D, ...).
    """

    def __init__(self, id):
        self.id = id
        self.susceptible = 0            # S
        self.infected_ut = 0            # Iu: infected, not yet tested
        self.infected_t = 0             # It: infected, tested positive
        self.death = 0                  # D
        self.in_hospital = 0            # Ih
        self.infected_asymptomatic = 0  # Ia: infected, tests negative
        self.recovered = 0              # R
        self.latent = 0                 # E (incubating)

    def set_susceptible(self, susceptible):
        self.susceptible = susceptible

    def set_latent(self, latent):
        self.latent = latent

    def set_infected_ut(self, infected_ut):
        self.infected_ut = infected_ut

    def set_infected_t(self, infected_t):
        self.infected_t = infected_t

    def set_infected_asymptomatic(self, infected_asymptomatic):
        self.infected_asymptomatic = infected_asymptomatic

    def set_death(self, death):
        self.death = death

    def set_in_hospital(self, in_hospital):
        self.in_hospital = in_hospital

    def set_recovered(self, recovered):
        self.recovered = recovered

    def step(self, mask_number, mask_quality):
        """Advance this region's compartments by one 30-minute tick.

        mask_number: masks allocated to this region for the current period.
        mask_quality: fraction of transmissions a worn mask blocks (γ).

        All transitions are sampled with np.random.multinomial, whose last
        probability entry acts as the "no transition" remainder.
        """
        # Living population (everyone except the dead).
        alive = (self.susceptible + self.latent + self.infected_ut
                 + self.infected_t + self.infected_asymptomatic
                 + self.in_hospital + self.recovered)
        if alive > 0:
            # Mask coverage Π(t) = masks / living population, capped at 1;
            # transmission is scaled by (1 - Π(t)·γ).
            mask_factor = 1 - np.clip(mask_number / alive, 0, 1) * mask_quality
            # Per-susceptible infection probability:
            # λ = (Iu + It + r_a·Ia + r_L·E) / alive · β · (1 - Π(t)·γ)
            # Fix: the pasted source had `self` broken across two lines
            # (`s` / `elf.infected_t`) in this denominator — a SyntaxError.
            lambda_j = ((self.infected_ut
                         + self.infected_t
                         + self.infected_asymptomatic * r_a
                         + self.latent * r_L) / alive) * beta * mask_factor

            # S -> E: each susceptible becomes latent with probability λ.
            susceptible_to_latent, __ = np.random.multinomial(self.susceptible, [lambda_j, 1])
            self.susceptible -= susceptible_to_latent
            self.latent += susceptible_to_latent

            # E -> Iu (true positive path) / Ia (false negative path).
            latent_to_infected, latent_to_Ia, __ = np.random.multinomial(self.latent, [L_I, L_Ia, 1])
            self.infected_ut += latent_to_infected
            self.infected_asymptomatic += latent_to_Ia
            self.latent -= (latent_to_Ia + latent_to_infected)

            # Iu -> It: untested infected get tested at rate 1/theta.
            # (The original comment wrongly said Iu -> Ia.)
            prob = I_h
            infected_ut_to_t, __ = np.random.multinomial(self.infected_ut, [prob, 1])
            self.infected_ut -= infected_ut_to_t
            self.infected_t += infected_ut_to_t

            # Iu -> D / R: untested infected die or recover.
            infected_to_death, infected_to_recovered, __ = np.random.multinomial(self.infected_ut, [I_D, I_R, 1])
            self.death += infected_to_death
            self.recovered += infected_to_recovered
            self.infected_ut -= (infected_to_death + infected_to_recovered)

            # It -> D / R: tested infected die or recover at the same rates.
            infected_to_death, infected_to_recovered, __ = np.random.multinomial(self.infected_t, [I_D, I_R, 1])
            self.death += infected_to_death
            self.recovered += infected_to_recovered
            self.infected_t -= (infected_to_death + infected_to_recovered)

            # Ia -> R: asymptomatic cases recover (no death path here).
            Ia_to_recovered, __ = np.random.multinomial(self.infected_asymptomatic, [Ia_R, 1])
            self.recovered += Ia_to_recovered
            self.infected_asymptomatic -= Ia_to_recovered

            # Ih -> D / R: hospitalized patients die or recover.
            in_hospital_to_death, in_hospital_to_recovered, __ = np.random.multinomial(self.in_hospital, [Ih_D, Ih_R, 1])
            self.death += in_hospital_to_death
            self.recovered += in_hospital_to_recovered
            self.in_hospital -= (in_hospital_to_death + in_hospital_to_recovered)

以上是关于论文代码解读 Hierarchical Reinforcement Learning for Scarce Medical Resource Allocation的主要内容,如果未能解决你的问题,请参考以下文章

论文解析[9] Swin Transformer: Hierarchical Vision Transformer using Shifted Windows

论文笔记:FeUdal Networks for Hierarchical Reinforcement Learning

论文笔记 Hierarchical Reinforcement Learning for Scarce Medical Resource Allocation

[论文阅读笔记] HARP Hierarchical Representation Learning for Networks

论文笔记之: Hierarchical Convolutional Features for Visual Tracking

论文阅读-Hierarchical Cross-Modal Talking Face Generation with Dynamic Pixel-Wise Loss