app: refactor the Connection to make it asynchronous when connecting

Some work has been done on making the Connection feel nicer, but also
more work is needed to not have the channel be exposed to the upper
layers of the application. We should wrap all the GRPC calls in the
GenServer (which may also allow caching on certain calls such as
get_sys_info)
This commit is contained in:
Nikos Papadakis 2023-08-28 23:32:42 +03:00
parent be7f584010
commit 26ba60b95d
Signed by untrusted user who does not match committer: nikos
GPG key ID: 78871F9905ADFF02
9 changed files with 231 additions and 131 deletions

View file

@ -46,7 +46,8 @@ impl agent_server::Agent for AgentService {
sys.refresh_specifics( sys.refresh_specifics(
sysinfo::RefreshKind::new() sysinfo::RefreshKind::new()
.with_disks() // .with_disks() // what is this?
.with_disks_list()
.with_memory() .with_memory()
.with_processes(sysinfo::ProcessRefreshKind::everything()) .with_processes(sysinfo::ProcessRefreshKind::everything())
.with_cpu(sysinfo::CpuRefreshKind::everything()), .with_cpu(sysinfo::CpuRefreshKind::everything()),

View file

@ -7,42 +7,51 @@ defmodule Prymn.Agents do
""" """
@doc """ alias Prymn.Agents.Connection
Establishes a dynamically supervised, background connection with the target alias Prymn.Agents.Health
host agent, keeping it alive if it is already started.
""" def start_connection(host_address) do
@spec start_connection_or_keep_alive(String.t()) :: :ok spec = {Connection, host_address}
def start_connection_or_keep_alive(host_address) do
spec = {Prymn.Agents.Connection, host_address}
case DynamicSupervisor.start_child(Prymn.Agents.ConnectionSupervisor, spec) do case DynamicSupervisor.start_child(Prymn.Agents.ConnectionSupervisor, spec) do
{:error, {:already_started, pid}} -> {:ok, _pid} -> :ok
:ok = Prymn.Agents.Connection.keep_alive(pid) {:error, {:already_started, _pid}} -> :ok
:ok {:error, error} -> {:error, error}
{:ok, _pid} ->
:ok
end end
end end
@doc """ @doc """
Subscribe to the host's Health using Phoenix.PubSub Broadcasted messages are Subscribe to the host's Health using Phoenix.PubSub Broadcasted messages are
in the form of: the Health struct:
{host_address, %Prymn.Agents.Health{}} %Prymn.Agents.Health{}
## Returns
This function returns the last health status of the Agent that was saved.
""" """
@spec subscribe_to_health(String.t()) :: Prymn.Agents.Health.t()
def subscribe_to_health(host_address) do def subscribe_to_health(host_address) do
Prymn.Agents.Health.subscribe(host_address) :ok = Health.subscribe(host_address)
Prymn.Agents.Health.lookup(host_address)
end end
@spec get_channel(String.t()) :: GRPC.Channel.t() @doc """
Return the last known health status of the Agent, or `nil` if it doesn't
exist.
"""
def get_health(host_address) do
Health.lookup(host_address)
end
# TODO: We should not expose this api, instead wrap every GRPC call in this
# module GRPC is an "internal implementation detail" (although it probably
# wont ever change)
#
# E.g.
# def get_sys_info(agent) do
# PrymnProto.Prymn.Agent.Stub.get_sys_info(agent.channel, %Google.Protobuf.Empty{})
# end
def get_channel(host_address) do def get_channel(host_address) do
Prymn.Agents.Connection.get_channel(host_address) with [{pid, _}] <- Registry.lookup(Prymn.Agents.Registry, host_address),
channel when channel != nil <- Connection.get_channel(pid) do
{:ok, channel}
else
_ -> {:error, :not_found}
end
end end
end end

View file

@ -1,6 +1,7 @@
defmodule Prymn.Agents.Connection do defmodule Prymn.Agents.Connection do
@moduledoc false @moduledoc false
alias Prymn.Agents.Health
require Logger require Logger
use GenServer, restart: :transient use GenServer, restart: :transient
@ -8,39 +9,36 @@ defmodule Prymn.Agents.Connection do
@timeout :timer.minutes(2) @timeout :timer.minutes(2)
@spec start_link(String.t()) :: GenServer.on_start() @spec start_link(String.t()) :: GenServer.on_start()
def start_link(addr) when is_binary(addr) do def start_link(host_address) do
GenServer.start_link(__MODULE__, addr, name: via(addr)) GenServer.start_link(__MODULE__, host_address, name: via(host_address))
end end
@spec keep_alive(pid | String.t()) :: :ok @spec get_channel(pid) :: GRPC.Channel.t() | nil
def keep_alive(server) when is_pid(server), do: GenServer.cast(server, :keep_alive)
def keep_alive(server) when is_binary(server), do: GenServer.cast(via(server), :keep_alive)
@spec stop(pid | String.t()) :: :ok
def stop(server) when is_pid(server), do: GenServer.stop(server, :shutdown)
def stop(server) when is_binary(server), do: GenServer.stop(via(server), :shutdown)
@spec get_channel(String.t()) :: GRPC.Channel.t()
def get_channel(server) do def get_channel(server) do
GenServer.call(via(server), :get_channel) GenServer.call(server, :get_channel)
end end
##
## Server callbacks
##
@impl true @impl true
def init(host) do def init(host) do
Process.flag(:trap_exit, true) # Process.flag(:trap_exit, true)
{:ok, {host, nil}, {:continue, :connect}} pid = self()
# Start a connection without blocking the GenServer
Task.start_link(fn ->
case GRPC.Stub.connect(host, 50012, []) do
{:ok, channel} -> send(pid, channel)
{:error, error} -> send(pid, {:connect_error, error})
end end
@impl true # Keep receiving and sending back any messages to the GenServer forever
def handle_continue(:connect, {host_address, _} = state) do receive_loop(pid)
case GRPC.Stub.connect(host_address, 50012, []) do end)
{:ok, channel} ->
keep_alive(self())
{:noreply, {host_address, channel}, {:continue, :health}}
{:error, error} -> {:ok, {host, nil}}
{:stop, error, state}
end
end end
@impl true @impl true
@ -56,7 +54,12 @@ defmodule Prymn.Agents.Connection do
|> Enum.take_while(fn _ -> true end) |> Enum.take_while(fn _ -> true end)
end) end)
{:noreply, state, @timeout} {:noreply, state}
end
@impl true
def handle_cast(_, state) do
{:noreply, state}
end end
@impl true @impl true
@ -65,37 +68,44 @@ defmodule Prymn.Agents.Connection do
end end
@impl true @impl true
def handle_cast(:keep_alive, state) do def handle_info(%GRPC.Channel{} = channel, {host, _}) do
{:noreply, state, @timeout} {:noreply, {host, channel}, {:continue, :health}}
end
def handle_info({:connect_error, reason}, {host, _} = state) do
if reason == :timeout do
Health.lookup(host, default: true)
|> Health.make_timed_out()
|> Health.update_and_broadcast()
end
{:stop, reason, state}
end end
@impl true
def handle_info(%PrymnProto.Prymn.HealthResponse{} = response, {host, _} = state) do def handle_info(%PrymnProto.Prymn.HealthResponse{} = response, {host, _} = state) do
response response
|> Prymn.Agents.Health.new_from_proto() |> Health.make_from_proto(host)
|> Prymn.Agents.Health.update(host) |> Health.update_and_broadcast()
{:noreply, state, @timeout} {:noreply, state, @timeout}
end end
def handle_info(%GRPC.RPCError{} = response, {host, _} = state) do def handle_info(%GRPC.RPCError{} = response, state) do
Logger.debug("received a GRPC error: #{inspect(response)}") Logger.debug("received a GRPC error: #{inspect(response)}")
{:noreply, state}
response
|> Prymn.Agents.Health.new_from_proto()
|> Prymn.Agents.Health.update(host)
{:noreply, state, @timeout}
end end
def handle_info({:gun_up, _pid, _protocol}, state) do def handle_info({:gun_up, _pid, _protocol}, state) do
# TODO: If it's possible for the GRPC connection to be down when we receive # NOTE: If it's possible for the GRPC connection to be down when we receive
# this message, we should `{:continue, state.channel.host}` # this message, maybe we should restart the connection
{:noreply, state, {:continue, :health}} {:noreply, state, {:continue, :health}}
end end
def handle_info({:gun_down, _pid, _proto, _reason, _}, {host, _} = state) do def handle_info({:gun_down, _pid, _proto, _reason, _}, {host, _} = state) do
Logger.debug("disconnected from #{host}") Health.lookup(host)
|> Health.make_disconnected()
|> Health.update_and_broadcast()
{:noreply, state, @timeout} {:noreply, state, @timeout}
end end
@ -103,24 +113,27 @@ defmodule Prymn.Agents.Connection do
{:stop, {:shutdown, :timeout}, state} {:stop, {:shutdown, :timeout}, state}
end end
def handle_info({:EXIT, _from, _reason}, state) do
# TODO: How to handle this... (this happens when a linked process exists
# e.g the one with the health stream)
{:noreply, state, @timeout}
end
def handle_info(msg, state) do def handle_info(msg, state) do
Logger.warning("received unexpected message: #{inspect(msg)}") Logger.debug("received unhandled message #{inspect(msg)}")
{:noreply, state, @timeout} {:noreply, state}
end end
@impl true @impl true
def terminate(reason, {host, channel}) do def terminate(reason, {host, channel}) do
Logger.debug("terminating Agent connection (host: #{host}, reason: #{inspect(reason)})") Logger.debug("terminating Agent connection (host: #{host}, reason: #{inspect(reason)})")
Health.delete(host)
if channel, do: GRPC.Stub.disconnect(channel) if channel, do: GRPC.Stub.disconnect(channel)
end end
defp via(name) do defp via(name) do
{:via, Registry, {Prymn.Agents.Registry, name}} {:via, Registry, {Prymn.Agents.Registry, name}}
end end
defp receive_loop(pid) do
receive do
msg -> send(pid, msg)
end
receive_loop(pid)
end
end end

View file

@ -1,66 +1,82 @@
defmodule Prymn.Agents.Health do defmodule Prymn.Agents.Health do
@doc """ @moduledoc """
The Health struct keeps simple health information of The Health struct keeps simple health information of whether or not the
whether or not the target host machine is up to date, has any tasks running, target host machine is up to date, has any tasks running, its resources are
its resources are getting depleted, or if it's unable be reached. getting depleted, or if it's unable be reached.
""" """
defstruct [:version, message: "Unknown"] defstruct [:host, :version, message: "Unknown"]
alias PrymnProto.Prymn.HealthResponse alias PrymnProto.Prymn.HealthResponse
@type t :: %{ @type t :: %{
host: String.t(),
version: String.t(), version: String.t(),
message: String.t() message: String.t()
} }
@doc false
def start() do def start() do
:ets.new(__MODULE__, [:set, :public, :named_table, read_concurrency: true]) :ets.new(__MODULE__, [:set, :public, :named_table, read_concurrency: true])
end end
@doc false def subscribe(host) do
def subscribe(host_address) do Phoenix.PubSub.subscribe(Prymn.PubSub, "health:#{host}")
Phoenix.PubSub.subscribe(Prymn.PubSub, "health:#{host_address}")
end end
@doc false def broadcast!(%__MODULE__{host: host} = health) do
def update(health, host_address) do Phoenix.PubSub.broadcast!(Prymn.PubSub, "health:#{host}", health)
:ets.insert(__MODULE__, {host_address, health})
Phoenix.PubSub.broadcast!(Prymn.PubSub, "health:#{host_address}", {host_address, health})
end end
@doc false def update_and_broadcast(nil) do
def lookup(host_address) do nil
end
def update_and_broadcast(%__MODULE__{host: host} = health) do
:ets.insert(__MODULE__, {host, health})
broadcast!(health)
end
def delete(host_address) do
:ets.delete(__MODULE__, host_address)
end
def lookup(host_address, opts \\ []) do
default = Keyword.get(opts, :default, false)
case :ets.lookup(__MODULE__, host_address) do case :ets.lookup(__MODULE__, host_address) do
[{^host_address, value}] -> value [{^host_address, value}] -> value
[] when default -> %__MODULE__{host: host_address}
[] -> nil [] -> nil
end end
end end
@doc false def make_timed_out(%__MODULE__{} = health) do
def new_from_proto(proto_health) do %__MODULE__{health | message: "Connect timed out"}
case proto_health do
%HealthResponse{} = health ->
from_health(health)
%GRPC.RPCError{message: ":stream_error: :closed"} ->
%__MODULE__{message: "Disconnected"}
%GRPC.RPCError{} = error ->
require Logger
Logger.error("unhandled GRPC error received in Health module: #{inspect(error)}")
%__MODULE__{message: "Error retrieving server status"}
end
end end
defp from_health(%HealthResponse{system: system, version: version}) do def make_disconnected(%__MODULE__{} = health) do
%__MODULE__{health | message: "Disconnected"}
end
def make_from_proto(%HealthResponse{system: system, version: version, tasks: tasks}, host) do
%__MODULE__{host: host}
|> do_version(version)
|> do_system(system)
|> do_tasks(tasks)
end
defp do_version(health, version) do
%__MODULE__{health | version: version}
end
defp do_system(health, system) do
case system.status do case system.status do
"normal" -> "normal" -> %__MODULE__{health | message: "Connected"}
%__MODULE__{message: "Connected", version: version} status -> %__MODULE__{health | message: "Alert: #{status}"}
end
end
status -> defp do_tasks(health, _tasks) do
%__MODULE__{message: status, version: version} health
end
end end
end end

View file

@ -12,8 +12,6 @@ defmodule Prymn.Servers.Server do
values: [:unregistered, :registered], values: [:unregistered, :registered],
default: :unregistered default: :unregistered
field :connection_status, :string, virtual: true
timestamps() timestamps()
end end

View file

@ -1,7 +1,6 @@
defmodule PrymnWeb.ServerLive.Index do defmodule PrymnWeb.ServerLive.Index do
require Logger require Logger
alias Prymn.Agents.Health
alias Prymn.{Servers, Agents} alias Prymn.{Servers, Agents}
use PrymnWeb, :live_view use PrymnWeb, :live_view
@ -13,8 +12,9 @@ defmodule PrymnWeb.ServerLive.Index do
healths = healths =
if connected?(socket) do if connected?(socket) do
for %Servers.Server{status: :registered, public_ip: ip} <- servers, into: %{} do for %Servers.Server{status: :registered, public_ip: ip} <- servers, into: %{} do
Agents.start_connection_or_keep_alive(ip) Agents.subscribe_to_health(ip)
{ip, Agents.subscribe_to_health(ip)} Agents.start_connection(ip)
{ip, Agents.get_health(ip)}
end end
else else
%{} %{}
@ -44,8 +44,8 @@ defmodule PrymnWeb.ServerLive.Index do
|> update(:servers, fn servers -> [server | servers] end)} |> update(:servers, fn servers -> [server | servers] end)}
end end
def handle_info({host, %Prymn.Agents.Health{} = health}, socket) do def handle_info(%Agents.Health{} = health, socket) do
healths = Map.put(socket.assigns.healths, host, health) healths = Map.put(socket.assigns.healths, health.host, health)
{:noreply, assign(socket, :healths, healths)} {:noreply, assign(socket, :healths, healths)}
end end
@ -55,22 +55,32 @@ defmodule PrymnWeb.ServerLive.Index do
end end
defp server_status(assigns) do defp server_status(assigns) do
case {assigns.server, assigns.health} do case {assigns.status, assigns.health} do
{%{status: :registered}, nil} -> {:unregistered, _} ->
~H""" ~H"""
Unknown <span class="text-gray-500">Needs registration</span>
""" """
{%{status: :registered}, %Health{message: message}} -> {:registered, nil} ->
assigns = assign(assigns, :status, message)
~H""" ~H"""
<%= @status %> <span class="text-yellow-600">Connecting...</span>
""" """
{_, _} -> {:registered, %Agents.Health{message: "Connected"}} ->
~H""" ~H"""
Not registered <span class="text-green-600">Connected</span>
"""
{:registered, %Agents.Health{message: "Disconnected"}} ->
~H"""
<span class="text-red-600">Disconnected</span>
"""
{:registered, %Agents.Health{message: message}} ->
assigns = assign(assigns, :message, message)
~H"""
<span class="text-yellow-900"><%= @message %></span>
""" """
end end
end end

View file

@ -19,7 +19,7 @@
<div class="flex flex-row flex-wrap justify-between"> <div class="flex flex-row flex-wrap justify-between">
<h2 class="text-xl"><%= server.name %></h2> <h2 class="text-xl"><%= server.name %></h2>
<span class="self-center text-sm"> <span class="self-center text-sm">
<.server_status server={server} health={@healths[server.public_ip]} /> <.server_status status={server.status} health={@healths[server.public_ip]} />
</span> </span>
</div> </div>
<div class="lg:text-sm"> <div class="lg:text-sm">

View file

@ -13,17 +13,22 @@ defmodule PrymnWeb.ServerLive.Show do
server = Servers.get_server!(id) server = Servers.get_server!(id)
pid = self() pid = self()
if connected?(socket) do if connected?(socket) and server.status == :registered do
Agents.start_connection_or_keep_alive(server.public_ip) Agents.subscribe_to_health(server.public_ip)
Agents.start_connection(server.public_ip)
Task.start_link(fn -> get_sys_info(pid, server.public_ip) end) Task.start_link(fn -> get_sys_info(pid, server.public_ip) end)
end end
health = Agents.get_health(server.public_ip)
{:noreply, {:noreply,
socket socket
|> assign(:health, health || %{message: "Connecting..."})
|> assign(:page_title, server.name) |> assign(:page_title, server.name)
|> assign(:server, server) |> assign(:server, server)
|> assign(:uptime, 0) |> assign(:uptime, 0)
|> assign(:cpus, []) |> assign(:cpus, [])
|> assign(:used_disk, 0)
|> assign(:total_memory, 0) |> assign(:total_memory, 0)
|> assign(:used_memory, 0) |> assign(:used_memory, 0)
|> assign(:registration_command, Servers.create_setup_command(server))} |> assign(:registration_command, Servers.create_setup_command(server))}
@ -39,18 +44,39 @@ defmodule PrymnWeb.ServerLive.Show do
bytes_to_gigabytes(response.mem_total_bytes - response.mem_avail_bytes) bytes_to_gigabytes(response.mem_total_bytes - response.mem_avail_bytes)
) )
|> assign(:total_memory, bytes_to_gigabytes(response.mem_total_bytes)) |> assign(:total_memory, bytes_to_gigabytes(response.mem_total_bytes))
|> assign(:used_disk, calculate_disk_used_percent(response.disks))
|> assign(:cpus, response.cpus)} |> assign(:cpus, response.cpus)}
end end
def handle_info(%Agents.Health{} = health, socket) do
{:noreply, assign(socket, :health, health)}
end
defp bytes_to_gigabytes(bytes) do defp bytes_to_gigabytes(bytes) do
Float.round(bytes / Integer.pow(1024, 3), 2) Float.round(bytes / Integer.pow(1024, 3), 2)
end end
defp calculate_disk_used_percent(disks) do
alias PrymnProto.Prymn.SysInfoResponse.Disk
{used, total} =
Enum.reduce(disks, {0, 0}, fn %Disk{} = disk, {used, total} ->
{used + disk.total_bytes - disk.avail_bytes, total + disk.total_bytes}
end)
Float.round(100 * used / total, 2)
end
defp get_sys_info(from, host_address) do defp get_sys_info(from, host_address) do
channel = Agents.get_channel(host_address) alias PrymnProto.Prymn.Agent
{:ok, reply} = PrymnProto.Prymn.Agent.Stub.get_sys_info(channel, %Google.Protobuf.Empty{})
with {:ok, channel} <- Agents.get_channel(host_address),
{:ok, reply} <- Agent.Stub.get_sys_info(channel, %Google.Protobuf.Empty{}) do
send(from, reply) send(from, reply)
Process.sleep(:timer.seconds(2)) end
Process.sleep(:timer.seconds(5))
get_sys_info(from, host_address) get_sys_info(from, host_address)
end end
end end

View file

@ -1,5 +1,25 @@
<.header> <.header>
Server <%= @server.name %> <span
role="tooltip"
class="relative inline-flex h-3 w-3
before:-translate-x-1/2 before:-translate-y-full before:-top-2
before:left-1/2 before:absolute before:text-sm before:text-white
before:font-normal before:content-[attr(data-tip)] before:opacity-0
hover:before:opacity-100 before:py-1 before:px-2 before:bg-black
before:rounded before:pointer-events-none before:transition-opacity"
data-tip={@health.message}
>
<%= case @health.message do %>
<% "Connected" -> %>
<span class="absolute top-0 left-0 h-full w-full animate-ping rounded-full bg-green-400 opacity-75" />
<span class="h-3 w-3 rounded-full bg-green-500" />
<% "Disconnected" -> %>
<span class="h-3 w-3 rounded-full bg-red-500" />
<% _ -> %>
<span class="h-3 w-3 rounded-full bg-yellow-500" />
<% end %>
</span>
<span class="ml-3">Server <%= @server.name %></span>
</.header> </.header>
<section :if={@server.status == :unregistered} class="my-10"> <section :if={@server.status == :unregistered} class="my-10">
@ -38,6 +58,13 @@
</p> </p>
<p class="text-sm">Memory</p> <p class="text-sm">Memory</p>
</div> </div>
<div class="ml-4">
<p class="text-xl">
<%= @used_disk %>
<span>%</span>
</p>
<p class="text-sm">Used Disk</p>
</div>
</section> </section>
<.back navigate={~p"/servers"}>Back to servers</.back> <.back navigate={~p"/servers"}>Back to servers</.back>