Skip to content

Commit c58508c

Browse files
authored
Merge pull request #92 from alerta/heartbeats-alert
Alert on stale or slow heartbeats
2 parents f69f4cc + 93fef3f commit c58508c

1 file changed

Lines changed: 57 additions & 3 deletions

File tree

alertaclient/commands/cmd_heartbeats.py

Lines changed: 57 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,15 @@
22
import click
33

44
from tabulate import tabulate
5+
from alertaclient.models.heartbeat import MAX_LATENCY
56

67

78
@click.command('heartbeats', short_help='List heartbeats')
9+
@click.option('--alert', is_flag=True, help='Send alerts on stale or slow heartbeats')
10+
@click.option('--severity', '-s', metavar='SEVERITY', default='major', help='Set the severity for stale heartbeat alerts')
811
@click.option('--purge', is_flag=True, help='Delete stale heartbeats')
912
@click.pass_obj
10-
def cli(obj, purge):
13+
def cli(obj, alert, severity, purge):
1114
"""List heartbeats."""
1215
client = obj['client']
1316
timezone = obj['timezone']
@@ -18,8 +21,59 @@ def cli(obj, purge):
1821
heartbeats = client.get_heartbeats()
1922
click.echo(tabulate([h.tabular(timezone) for h in heartbeats], headers=headers, tablefmt=obj['output']))
2023

21-
expired = [hb for hb in heartbeats if hb.status == 'expired']
24+
not_ok = [hb for hb in heartbeats if hb.status != 'ok']
2225
if purge:
23-
with click.progressbar(expired, label='Purging {} heartbeats'.format(len(expired))) as bar:
26+
with click.progressbar(not_ok, label='Purging {} heartbeats'.format(len(not_ok))) as bar:
2427
for b in bar:
2528
client.delete_heartbeat(b.id)
29+
30+
elif alert:
31+
with click.progressbar(heartbeats, label='Alerting {} heartbeats'.format(len(heartbeats))) as bar:
32+
for b in bar:
33+
params = dict(filter(lambda a: len(a) == 2, map(lambda a: a.split(':'), b.tags)))
34+
environment = params.get('environment', 'Production')
35+
group = params.get('group', 'System')
36+
tags = list(filter(lambda a: not a.startswith('environment:') and not a.startswith('group:'), b.tags))
37+
38+
if b.status == 'expired': # aka. "stale"
39+
client.send_alert(
40+
resource=b.origin,
41+
event='HeartbeatFail',
42+
correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
43+
group=group,
44+
environment=environment,
45+
service=['Alerta'],
46+
severity=severity,
47+
value='{}'.format(b.since),
48+
text='Heartbeat not received in {} seconds'.format(b.timeout),
49+
tags=tags,
50+
type='heartbeatAlert'
51+
)
52+
elif b.status == 'slow':
53+
client.send_alert(
54+
resource=b.origin,
55+
event='HeartbeatSlow',
56+
correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
57+
group=group,
58+
environment=environment,
59+
service=['Alerta'],
60+
severity=severity,
61+
value='{}ms'.format(b.latency),
62+
text='Heartbeat took more than {}ms to be processed'.format(MAX_LATENCY),
63+
tags=tags,
64+
type='heartbeatAlert'
65+
)
66+
else:
67+
client.send_alert(
68+
resource=b.origin,
69+
event='HeartbeatOK',
70+
correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
71+
group=group,
72+
environment=environment,
73+
service=['Alerta'],
74+
severity='normal',
75+
value='',
76+
text='Heartbeat OK',
77+
tags=tags,
78+
type='heartbeatAlert'
79+
)

0 commit comments

Comments (0)