Skip to content

Commit 9465251

Browse files
author
Perttu Savolainen
authored
feat: add default descriptions to alerts (#10)
* feat: add default descriptions to alerts
1 parent 4ff10ec commit 9465251

2 files changed

Lines changed: 95 additions & 20 deletions

File tree

Lines changed: 60 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,60 @@
1+
/**
 * Generic alarm description: a short checklist of things to consider when
 * writing an alert description (length limit, actionability, examples,
 * reusability).
 *
 * The literal starts and ends with a newline for readability; `.trim()`
 * strips both so the exported string begins with "Consider the following:".
 */
export const defaultAlarmDescriptionTemplate = `
Consider the following:
- Be precise: character limit of 1024 for the alert description
- Actionability: Is it actionable, is the alert even needed?
- Prefer examples: share e.g. CloudWatch Insights queries
- Reusability: If instruction is generic open a PR to mca-cli :)
`.trim();
8+
9+
/**
 * Alarm description for Lambda error alarms: triage steps plus two
 * CloudWatch Insights queries (find the failing requestId, then fetch the
 * logs for that requestId).
 *
 * `"requestIdHere"` is a placeholder the reader replaces by hand; nothing in
 * this module substitutes it. `.trim()` drops the newline that opens and
 * closes the literal.
 */
export const lambdaErrorsAlarmDescriptionTemplate = `
- Evaluate the criticality of alert:
* Check the amount of errors
* If there are a lot of errors inform the product owner immediately
- Find the requestId of the error with CloudWatch Insights query:
fields @timestamp, @message
| sort @timestamp desc
| filter @message like /ERROR/
- Get the logs for the requestId:
fields @timestamp, @message
| sort @timestamp desc
| filter @requestId = "requestIdHere"
- Check if a development ticket exists of this issue
* If not create one
`.trim();
24+
25+
/**
 * Alarm description for Lambda duration alarms: review steps plus a
 * CloudWatch Insights query that filters on `@duration`.
 *
 * `durationThresholdHere` is a placeholder the reader replaces by hand;
 * nothing in this module substitutes it. `.trim()` drops the newline that
 * opens and closes the literal.
 */
export const lambdaDurationAlarmDescriptionTemplate = `
- Check metric history for changes to durations
- Evaluate whether alarm threshold or applications needs to change
- CloudWatch Insights query to find offending durations:
fields @timestamp, @message
| sort @timestamp desc
| filter @duration > durationThresholdHere
`.trim();
33+
34+
/**
 * Alarm description for Lambda invocation-count alarms: review steps plus a
 * CloudWatch Insights query that counts "START RequestId:" log lines in
 * 5-minute bins.
 *
 * `.trim()` drops the newline that opens and closes the literal.
 */
export const lambdaInvocationsAlarmDescriptionTemplate = `
- Check metric history for changes to invocations
- Evaluate whether alarm threshold or applications needs to change
- CloudWatch Insights query to check the invocation counts:
fields @timestamp, @message
| sort @timestamp desc
| filter @message like /START RequestId:/
| stats count() by bin(5m)
`.trim();
43+
44+
/**
 * Alarm description for Lambda throttle alarms: a severity checklist centred
 * on whether the caller has retry logic and whether data integrity is
 * affected.
 *
 * `.trim()` drops the newline that opens and closes the literal.
 */
export const lambdaThrottlesAlarmDescriptionTemplate = `
- Check metric history for throttles
- Evaluate the severity
* Check how retry logic has been implemented
-> if retry logic is missing, issue is CRITICAL
* If situation is critical and urgent, request
concurrency limit extension from AWS support immediately
* Check whether the issue affects data integrity
* Add retry logic if it's missing!
`.trim();
54+
55+
/**
 * Lambda alarm descriptions keyed by alarm kind, so callers can write
 * `lambda.errors`, `lambda.duration`, `lambda.invocations` or
 * `lambda.throttles` instead of importing each template by name.
 */
export const lambda = {
  errors: lambdaErrorsAlarmDescriptionTemplate,
  duration: lambdaDurationAlarmDescriptionTemplate,
  invocations: lambdaInvocationsAlarmDescriptionTemplate,
  throttles: lambdaThrottlesAlarmDescriptionTemplate,
};

src/lib/monitoring/config.ts

Lines changed: 35 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -13,11 +13,17 @@ import {
1313
ConfigCustomDefaults,
1414
ConfigLogGroupAlarms,
1515
} from './types';
16+
import * as descriptions from './alarmDescriptions';
1617
import { Args, AWSItem } from './types';
1718
import diff from './diff';
1819

1920
type AlarmMetricConfig = ConfigLocals<ConfigMetricAlarms>;
2021

22+
/**
 * Baseline values for a `critical` alarm entry: a single evaluation period
 * and the generic description template.
 *
 * Intended to be spread first inside an alarm's `critical` object; any key
 * written after the spread (e.g. a resource-specific `alarmDescription` or a
 * different `evaluationPeriods`) overrides the value given here.
 */
const defaultGenericCriticalConfig = {
  evaluationPeriods: 1,
  alarmDescription: descriptions.defaultAlarmDescriptionTemplate,
};
26+
2127
export class ConfigGenerator {
2228
private config: Config;
2329

@@ -346,8 +352,9 @@ export class ConfigGenerator {
346352
autoResolve: false,
347353
alarm: {
348354
critical: {
355+
...defaultGenericCriticalConfig,
356+
alarmDescription: descriptions.lambda.errors,
349357
threshold: 1,
350-
evaluationPeriods: 1,
351358
},
352359
},
353360
metric: {
@@ -360,8 +367,9 @@ export class ConfigGenerator {
360367
autoResolve: false,
361368
alarm: {
362369
critical: {
370+
...defaultGenericCriticalConfig,
371+
alarmDescription: descriptions.lambda.invocations,
363372
threshold: 1000,
364-
evaluationPeriods: 1,
365373
},
366374
},
367375
metric: {
@@ -374,8 +382,9 @@ export class ConfigGenerator {
374382
autoResolve: false,
375383
alarm: {
376384
critical: {
385+
...defaultGenericCriticalConfig,
386+
alarmDescription: descriptions.lambda.duration,
377387
threshold: 2000,
378-
evaluationPeriods: 1,
379388
},
380389
},
381390
metric: {
@@ -388,8 +397,9 @@ export class ConfigGenerator {
388397
autoResolve: false,
389398
alarm: {
390399
critical: {
400+
...defaultGenericCriticalConfig,
401+
alarmDescription: descriptions.lambda.throttles,
391402
threshold: 1,
392-
evaluationPeriods: 1,
393403
},
394404
},
395405
metric: {
@@ -432,8 +442,8 @@ export class ConfigGenerator {
432442
autoResolve: false,
433443
alarm: {
434444
critical: {
445+
...defaultGenericCriticalConfig,
435446
threshold: 100,
436-
evaluationPeriods: 1,
437447
},
438448
},
439449
metric: {
@@ -446,8 +456,8 @@ export class ConfigGenerator {
446456
autoResolve: false,
447457
alarm: {
448458
critical: {
459+
...defaultGenericCriticalConfig,
449460
threshold: 200,
450-
evaluationPeriods: 1,
451461
},
452462
},
453463
metric: {
@@ -460,8 +470,8 @@ export class ConfigGenerator {
460470
autoResolve: false,
461471
alarm: {
462472
critical: {
473+
...defaultGenericCriticalConfig,
463474
threshold: 2000,
464-
evaluationPeriods: 1,
465475
},
466476
},
467477
metric: {
@@ -474,8 +484,8 @@ export class ConfigGenerator {
474484
autoResolve: false,
475485
alarm: {
476486
critical: {
487+
...defaultGenericCriticalConfig,
477488
threshold: 10,
478-
evaluationPeriods: 1,
479489
},
480490
},
481491
metric: {
@@ -550,8 +560,8 @@ export class ConfigGenerator {
550560
enabled: true,
551561
alarm: {
552562
critical: {
563+
...defaultGenericCriticalConfig,
553564
threshold: 90,
554-
evaluationPeriods: 1,
555565
},
556566
},
557567
metric: {
@@ -563,8 +573,8 @@ export class ConfigGenerator {
563573
enabled: true,
564574
alarm: {
565575
critical: {
576+
...defaultGenericCriticalConfig,
566577
threshold: 90,
567-
evaluationPeriods: 1,
568578
},
569579
},
570580
metric: {
@@ -600,8 +610,8 @@ export class ConfigGenerator {
600610
enabled: true,
601611
alarm: {
602612
critical: {
613+
...defaultGenericCriticalConfig,
603614
threshold: 1,
604-
evaluationPeriods: 1,
605615
},
606616
},
607617
metric: {
@@ -641,8 +651,8 @@ export class ConfigGenerator {
641651
enabled: true,
642652
alarm: {
643653
critical: {
654+
...defaultGenericCriticalConfig,
644655
threshold: 1,
645-
evaluationPeriods: 1,
646656
},
647657
},
648658
metric: {
@@ -689,6 +699,7 @@ export class ConfigGenerator {
689699
enabled: true,
690700
alarm: {
691701
critical: {
702+
...defaultGenericCriticalConfig,
692703
threshold: 75,
693704
evaluationPeriods: 5,
694705
},
@@ -703,8 +714,8 @@ export class ConfigGenerator {
703714
enabled: true,
704715
alarm: {
705716
critical: {
717+
...defaultGenericCriticalConfig,
706718
threshold: 1000000000, // 1GB
707-
evaluationPeriods: 1,
708719
comparisonOperator: 'LESS_THAN_THRESHOLD',
709720
},
710721
},
@@ -718,8 +729,8 @@ export class ConfigGenerator {
718729
enabled: true,
719730
alarm: {
720731
critical: {
732+
...defaultGenericCriticalConfig,
721733
threshold: 25,
722-
evaluationPeriods: 1,
723734
},
724735
},
725736
metric: {
@@ -732,8 +743,8 @@ export class ConfigGenerator {
732743
enabled: true,
733744
alarm: {
734745
critical: {
746+
...defaultGenericCriticalConfig,
735747
threshold: 75000000, // 75MB
736-
evaluationPeriods: 1,
737748
comparisonOperator: 'LESS_THAN_THRESHOLD',
738749
},
739750
},
@@ -747,8 +758,8 @@ export class ConfigGenerator {
747758
enabled: true,
748759
alarm: {
749760
critical: {
761+
...defaultGenericCriticalConfig,
750762
threshold: 1,
751-
evaluationPeriods: 1,
752763
},
753764
},
754765
metric: {
@@ -761,6 +772,7 @@ export class ConfigGenerator {
761772
enabled: true,
762773
alarm: {
763774
critical: {
775+
...defaultGenericCriticalConfig,
764776
threshold: 2,
765777
evaluationPeriods: 1,
766778
},
@@ -775,8 +787,8 @@ export class ConfigGenerator {
775787
enabled: true,
776788
alarm: {
777789
critical: {
790+
...defaultGenericCriticalConfig,
778791
threshold: 60,
779-
evaluationPeriods: 1,
780792
},
781793
},
782794
metric: {
@@ -808,7 +820,7 @@ export class ConfigGenerator {
808820
this.config = {
809821
...this.config,
810822
rdsInstances: rdsInstances.reduce(
811-
(acc, i) => ({ ...acc, [i.DBInstanceIdentifier || '']: {} }),
823+
(acc, instance) => ({ ...acc, [instance.DBInstanceIdentifier || '']: {} }),
812824
{} as AlarmMetricConfig,
813825
),
814826
custom: {
@@ -832,8 +844,8 @@ export class ConfigGenerator {
832844
enabled: true,
833845
alarm: {
834846
critical: {
847+
...defaultGenericCriticalConfig,
835848
threshold: 1,
836-
evaluationPeriods: 1,
837849
},
838850
},
839851
metric: {
@@ -845,6 +857,7 @@ export class ConfigGenerator {
845857
enabled: true,
846858
alarm: {
847859
critical: {
860+
...defaultGenericCriticalConfig,
848861
threshold: 75,
849862
evaluationPeriods: 5,
850863
},
@@ -859,6 +872,7 @@ export class ConfigGenerator {
859872
enabled: true,
860873
alarm: {
861874
critical: {
875+
...defaultGenericCriticalConfig,
862876
threshold: 75,
863877
evaluationPeriods: 5,
864878
},
@@ -873,6 +887,7 @@ export class ConfigGenerator {
873887
enabled: true,
874888
alarm: {
875889
critical: {
890+
...defaultGenericCriticalConfig,
876891
threshold: 20,
877892
evaluationPeriods: 1,
878893
},
@@ -929,8 +944,8 @@ export class ConfigGenerator {
929944
enabled: true,
930945
alarm: {
931946
critical: {
947+
...defaultGenericCriticalConfig,
932948
threshold: 10,
933-
evaluationPeriods: 1,
934949
},
935950
},
936951
metric: {

0 commit comments

Comments
 (0)