# CoAP/boucle.py

import datetime
import os
import socket
import threading
import time
import queue
import numpy as np
import tensorflow as tf
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.environments import py_environment, tf_py_environment
from tf_agents.networks import actor_distribution_network
from tf_agents.policies import policy_saver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from coapthon.client.helperclient import HelperClient
from coapthon.client.superviseur import (SuperviseurGlobal,
                                         SuperviseurLocalFiltre)
from coapthon.utils import parse_uri
from utils_learning import MaquetteCoapEnv

# Sizes of the fully connected layers handed to the actor network.
fc_layer_params = (30,)
# Maximum length given to the replay buffer.
replay_buffer_capacity = 1500
# Learning rate of the Adam optimizer.
learning_rate = 0.05

# Number of CoAP clients created for each maquette.
n_capteur = 25
# Number of maquettes built for collection (one collector thread each).
n_superviseur = 6

# Read the clock once so that the date, hour and minute all describe the
# same instant (three separate now() calls could straddle a boundary).
_run_start = datetime.datetime.now()
tempdir = "save_run_{}-{}-{}".format(
    _run_start.date(), _run_start.hour, _run_start.minute)

# Split the server URI, then try to replace the host name by its IP address.
host, port, path = parse_uri("coap://raspberrypi.local/basic")
try:
    tmp = socket.gethostbyname(host)
except socket.gaierror:
    # Resolution failed: keep the host exactly as parsed from the URI.
    pass
else:
    host = tmp

def _build_maquette():
    """Create one MaquetteCoapEnv with its own set of n_capteur CoAP clients."""
    clients = [HelperClient(server=(host, port)) for _ in range(n_capteur)]
    return MaquetteCoapEnv(clients,
                           SuperviseurLocalFiltre,
                           SuperviseurGlobal,
                           path)


# One environment whose specs are used to build the agent, plus one
# environment per collector.
eval_env_py = _build_maquette()
maquettes_py = [_build_maquette() for _ in range(n_superviseur)]

# Wrap the Python environments so they can be driven with TF policies.
maquettes = [tf_py_environment.TFPyEnvironment(env_py)
             for env_py in maquettes_py]
eval_env = tf_py_environment.TFPyEnvironment(eval_env_py)

# Specs are taken from the evaluation environment.
observation_spec = eval_env.observation_spec()
action_spec = eval_env.action_spec()

# Network mapping observations to a distribution over actions.
actor_net = actor_distribution_network.ActorDistributionNetwork(
    observation_spec,
    action_spec,
    fc_layer_params=fc_layer_params,
)

# The TF1-compat Adam optimizer is used here; tf.keras.optimizers.Adam was
# the alternative previously tried.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
global_step = tf.compat.v1.train.get_or_create_global_step()

# Counter handed to the agent as its train step counter.
train_step_counter = tf.compat.v2.Variable(0)

tf_agent = reinforce_agent.ReinforceAgent(
    eval_env.time_step_spec(),
    action_spec,
    actor_network=actor_net,
    optimizer=optimizer,
    normalize_returns=True,
    train_step_counter=train_step_counter,
)
tf_agent.initialize()

collect_policy = tf_agent.collect_policy  # With exploration
eval_policy = tf_agent.policy  # Without exploration

# Guards every access to the replay buffer shared by the collector threads
# and the training loop.
buffer_lock = threading.Lock()

# Buffer holding the collected trajectories until the next training pass.
# NOTE(review): every collector writes into this single buffer, so frames
# coming from different maquettes presumably end up interleaved - confirm
# this is acceptable for the return computation.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=eval_env.batch_size,
    max_length=replay_buffer_capacity,
)


def collecteur(maquette, policy):
    """Drive one maquette forever and store every transition in the buffer.

    Meant to be used as the target of a collector thread: it loops until the
    process ends and never returns.

    Args:
        maquette: Environment to drive. Its ``step(action)`` and ``reset()``
            methods must both return a time step (the caller passes
            ``TFPyEnvironment`` instances).
        policy: Policy whose ``action(time_step)`` yields the action step to
            apply next.
    """
    # Bootstrap with an all-zero action to obtain a first time step.
    time_step = maquette.step(np.array(n_capteur*[0], dtype=np.float32))
    while True:
        action_step = policy.action(time_step)
        next_time_step = maquette.step(action_step.action)
        traj = trajectory.from_transition(
            time_step, action_step, next_time_step)
        # The buffer is shared with the other collectors and the trainer.
        with buffer_lock:
            replay_buffer.add_batch(traj)
        if traj.is_boundary():
            # Continue from the time step produced by the reset; keeping
            # next_time_step here would make the policy act on an
            # observation that predates the reset.
            time_step = maquette.reset()
        else:
            time_step = next_time_step


# Output locations, both under the per-run directory.
checkpoint_dir = os.path.join(tempdir, 'checkpoint')
policy_dir = os.path.join(tempdir, 'policy')

# Objects tracked by the training checkpoint.
_checkpointed = dict(
    agent=tf_agent,
    policy=tf_agent.policy,
    replay_buffer=replay_buffer,
    global_step=global_step,
)
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    **_checkpointed
)

# Saver used to export the agent's policy into policy_dir.
tf_policy_saver = policy_saver.PolicySaver(tf_agent.policy)

# One collector thread per maquette, all sharing the collect policy.
threads_collecteur = [
    threading.Thread(
        target=collecteur,
        name="Collecteur {}".format(n),
        args=(maquette, collect_policy),
    )
    for n, maquette in enumerate(maquettes)
]

# Launch the collectors in creation order.
for collector_thread in threads_collecteur:
    collector_thread.start()

# Training loop: wake up once a minute, train on everything collected so far
# when at least 200 frames are available, then checkpoint the run.
while True:
    time.sleep(60)
    # Hold the lock for the whole cycle so no collector can add frames
    # between gathering and clearing, nor while the buffer is checkpointed.
    with buffer_lock:
        if replay_buffer.num_frames() >= 200:
            # A single gather is enough; the buffer is emptied right after
            # the training step.
            experience = replay_buffer.gather_all()
            train_loss = tf_agent.train(experience)
            replay_buffer.clear()
            tf_policy_saver.save(policy_dir)
        print("thread : {}\tBuffer :{}".format(
            threading.active_count(), replay_buffer.num_frames()))
        try:
            train_checkpointer.save(global_step)
        except Exception as exc:
            # Checkpointing stays best-effort, but a failure is reported
            # instead of being silently dropped.
            print("Checkpoint save failed: {!r}".format(exc))