Hi,
I am struggling to create an Azure Monitor log query that tracks running VMs whose Agent Status is "Not Ready".
My understanding is that the Heartbeat table provides information such as Agent State (Healthy or Unhealthy), which I assume is the same as Agent Status (Ready or Not Ready). Is that correct?
I am using the following query:
// AllAgentsData: one row per resource / computer / agent category / agent version (plus subscription,
// lower-cased resource group and environment) seen in Heartbeat during the last day, with the first
// and last heartbeat time of each group.
// materialize() caches the result because two later statements reference it.
let AllAgentsData = materialize(
Heartbeat
| where TimeGenerated > ago(1d)
| summarize
FirstHeartbeat=min(TimeGenerated),
LastHeartbeat=max(TimeGenerated),
// OSType and OSName are taken from the most recent heartbeat row of each group.
arg_max(TimeGenerated, OSType, OSName)
by
_ResourceId,
ResourceType,
Computer,
AgentCategory=Category,
AgentVersion=Version,
SubscriptionId,
lowerRG=tolower(ResourceGroup),
ComputerEnvironment
// A group is 'Inactive' once datetime_diff reports 60 or more minutes between its last heartbeat and now.
| extend ActivityState = iff(datetime_diff('minute', now(), LastHeartbeat) >= 60, 'Inactive', 'Active'));
// RelevantAgents: rolls the per-version rows of AllAgentsData up to one row per agent
// (resource, resource type, computer, agent category) and keeps only agents for which
// no version is currently 'Active'.
let RelevantAgents =
    AllAgentsData
    | summarize
        // make_list() is the current name of the obsolete makelist() alias.
        AgentsActivityStates=make_list(ActivityState),
        AgentVersions=array_sort_asc(make_list(AgentVersion)),
        ActiveAgentVersions=array_sort_asc(make_list_if(AgentVersion, ActivityState == "Active")),
        LastHeartbeat=max(LastHeartbeat),
        CurrentVersionStartTime=max(FirstHeartbeat)
        by _ResourceId, ResourceType, Computer, AgentCategory
    // Explicit array membership test: 'Healthy' when at least one version is 'Active'.
    // array_index_of returns -1 when the value is not present in the array.
    | extend HealthState = iff(array_index_of(AgentsActivityStates, 'Active') >= 0, 'Healthy', 'Unhealthy')
    | where HealthState != 'Healthy';
// RelevantAgentsData: re-attaches the per-version rows of AllAgentsData to every agent kept in
// RelevantAgents, matching on resource id, resource type, computer and agent category.
// NOTE(review): the `_ResourceId in (RelevantAgentsData)` filter in ActivityTrend matches against
// this table's first column, so the column order produced here matters - confirm before reordering.
let RelevantAgentsData =
RelevantAgents
| join kind=inner (AllAgentsData) on _ResourceId, ResourceType, Computer, AgentCategory;
// ActivityTrend: heartbeat counts in 30-minute buckets over the last day for every resource
// that appears in RelevantAgentsData.
let ActivityTrend =
    Heartbeat
    | where TimeGenerated > ago(1d)
    // Select the key column explicitly instead of relying on `in` silently using the first
    // column of a multi-column table.
    | where _ResourceId in ((RelevantAgentsData | distinct _ResourceId))
    | make-series HeartbeatsTrend=count()
        on TimeGenerated
        from (ago(1d)) to (now()) step (30m)
        by
        _ResourceId,
        ResourceType,
        Computer,
        AgentCategory=Category,
        SubscriptionId,
        lowerRG=tolower(ResourceGroup),
        ComputerEnvironment;
// Final result: for each agent in RelevantAgentsData, a human-readable explanation of its state,
// its version list and its heartbeat trend, projected with display-friendly column names.
RelevantAgentsData
| extend Details=case(
    // No version of the agent is currently active.
    array_length(ActiveAgentVersions) == 0,
    strcat("Agent stopped, last heartbeat: ", format_datetime(LastHeartbeat, 'yyyy-MM-dd HH:mm:ss'), " UTC"),
    // Several versions were seen but only one is still active.
    array_length(AgentVersions) > 1 and array_length(ActiveAgentVersions) == 1,
    strcat("Agent version upgraded at ", format_datetime(CurrentVersionStartTime, 'yyyy-MM-dd HH:mm:ss'), " UTC"),
    // Several versions are active at the same time.
    array_length(AgentVersions) > 1 and array_length(ActiveAgentVersions) > 1,
    strcat("Multiple versions installed: ", strcat_array(AgentVersions, ", ")),
    ""
    )
| summarize
    SubscriptionId=any(SubscriptionId),
    ComputerEnvironment=any(ComputerEnvironment),
    OSType=max(OSType),
    OSName=max(OSName),
    Details=any(Details)
    by
    _ResourceId,
    ResourceType,
    Computer,
    AgentCategory,
    HealthState,
    // Show the active versions when there are any, otherwise every version seen.
    // Both lists use the same ", " separator so the column is formatted consistently.
    AgentVersion=iff(array_length(ActiveAgentVersions) > 0, strcat_array(ActiveAgentVersions, ", "), strcat_array(AgentVersions, ", "))
// Attach the 30-minute heartbeat series; this join also brings in the lowerRG column.
| join kind=inner (ActivityTrend)
    on
    _ResourceId,
    ResourceType,
    Computer,
    AgentCategory,
    SubscriptionId,
    ComputerEnvironment
| project
    Resource=_ResourceId,
    ["Resource type"]=ResourceType,
    Computer,
    // Append the OS name in parentheses only when one was reported.
    OS=iff(strlen(OSName) > 0, strcat(OSType, " (", OSName, ")"), OSType),
    ["Agent category"]=AgentCategory,
    ["Agent version"]=AgentVersion,
    ["Heartbeats trend"]=HeartbeatsTrend,
    ["Agent state"]=HealthState,
    Details,
    Subscription=SubscriptionId,
    ["Resource group"]=lowerRG,
    Environment=ComputerEnvironment
| sort by Resource asc, Computer asc, ["Agent category"] asc, ["Agent version"] asc
The issue with this query is that, after I manually stopped walinuxagent on the VM, the VM shows Agent Status "Not Ready" in the portal, but running the query above still shows the VM as Healthy.
Log Output:
Virtual machine info via Azure Portal:
Could you please help me with an Azure Monitor log query that tracks running VMs with Agent Status "Not Ready"?