Skip to content

Commit 1ec2b3d

Browse files
authored
Ensure pools without upgrades are unpaused (#73)
We immediately set the upgrade job to the final state "success" if pools did not need any upgrade. The pause/unpause functionality ignores jobs in final states so that it does not interfere with running ones when doing a full reconcile. This left pools that did not need an upgrade stuck in the paused state. This PR ensures we unpause all pools before setting the final "success" state.
1 parent 64c3201 commit 1ec2b3d

2 files changed

Lines changed: 112 additions & 3 deletions

File tree

controllers/upgradejob_controller.go

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -176,7 +176,7 @@ func (r *UpgradeJobReconciler) reconcileStartedJob(ctx context.Context, uj *mana
176176
return ctrl.Result{}, fmt.Errorf("failed to lock cluster version: %w", err)
177177
}
178178

179-
if err := r.pauseUnpauseMachineConfigPools(ctx, uj); err != nil {
179+
if err := r.pauseUnpauseMachineConfigPools(ctx, uj, false); err != nil {
180180
return ctrl.Result{}, fmt.Errorf("failed to pause machine config pools: %w", err)
181181
}
182182

@@ -300,6 +300,11 @@ func (r *UpgradeJobReconciler) reconcileStartedJob(ctx context.Context, uj *mana
300300
return ctrl.Result{}, nil
301301
}
302302

303+
// Ensure pools that were paused but did not need an upgrade are unpaused
304+
if err := r.pauseUnpauseMachineConfigPools(ctx, uj, true); err != nil {
305+
return ctrl.Result{}, fmt.Errorf("failed to ensure machine config pools are unpaused: %w", err)
306+
}
307+
303308
// Set the upgrade as successful
304309
r.setStatusCondition(&uj.Status.Conditions, metav1.Condition{
305310
Type: managedupgradev1beta1.UpgradeJobConditionSucceeded,
@@ -846,16 +851,17 @@ func findTrackedHookJob(ujhookName, event string, uj managedupgradev1beta1.Upgra
846851

847852
// pauseUnpauseMachineConfigPools pauses or unpauses the machine config pools that match the given selectors in .Spec.MachineConfigPools and have a delay set.
848853
// The decision to pause or unpause is based on `pool.DelayUpgrade.DelayMin` relative to the startAfter time of the upgrade job.
854+
// If ensureUnpause is true, it will unpause the pools even if the delay has not expired.
849855
// It sets a timeout condition and returns an error if the delay is expired.
850856
// It also returns an error if the machine config pools cannot be listed or updated.
851-
func (r *UpgradeJobReconciler) pauseUnpauseMachineConfigPools(ctx context.Context, uj *managedupgradev1beta1.UpgradeJob) error {
857+
func (r *UpgradeJobReconciler) pauseUnpauseMachineConfigPools(ctx context.Context, uj *managedupgradev1beta1.UpgradeJob, ensureUnpause bool) error {
852858
var controllerManagesPools bool
853859
var controllerPausedPools bool
854860
for _, pool := range uj.Spec.MachineConfigPools {
855861
if pool.DelayUpgrade == (managedupgradev1beta1.UpgradeJobMachineConfigPoolDelayUpgradeSpec{}) {
856862
continue
857863
}
858-
shouldPause := r.timeSinceStartAfter(uj) < pool.DelayUpgrade.DelayMin.Duration
864+
shouldPause := !ensureUnpause && r.timeSinceStartAfter(uj) < pool.DelayUpgrade.DelayMin.Duration
859865
sel, err := metav1.LabelSelectorAsSelector(pool.MatchLabels)
860866
if err != nil {
861867
return fmt.Errorf("failed to parse machine config pool selector: %w", err)

controllers/upgradejob_controller_test.go

Lines changed: 103 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1289,6 +1289,109 @@ func Test_UpgradeJobReconciler_Reconcile_PausedMachineConfigPools_UnpauseExpire(
12891289
require.Equal(t, managedupgradev1beta1.UpgradeJobReasonUnpausingPoolsExpired, failedCond.Reason, "should set reason to unpausing pools expired")
12901290
}
12911291

1292+
// Test_UpgradeJobReconciler_Reconcile_PausedMachineConfigPools_EnsureUnpause tests that the upgrade job reconciler
1293+
// will unpause machine config pools at the end of an upgrade even if they did not require any upgrades
1294+
func Test_UpgradeJobReconciler_Reconcile_PausedMachineConfigPools_EnsureUnpause(t *testing.T) {
1295+
ctx := context.Background()
1296+
clock := mockClock{now: time.Date(2022, 12, 4, 22, 45, 0, 0, time.UTC)}
1297+
1298+
ucv := &configv1.ClusterVersion{
1299+
ObjectMeta: metav1.ObjectMeta{
1300+
Name: "version",
1301+
},
1302+
Status: configv1.ClusterVersionStatus{
1303+
AvailableUpdates: []configv1.Release{
1304+
{Version: "4.5.13"},
1305+
},
1306+
Conditions: []configv1.ClusterOperatorStatusCondition{
1307+
{
1308+
Type: configv1.OperatorDegraded,
1309+
Status: configv1.ConditionFalse,
1310+
},
1311+
},
1312+
},
1313+
}
1314+
1315+
workerPool := &machineconfigurationv1.MachineConfigPool{
1316+
ObjectMeta: metav1.ObjectMeta{
1317+
Name: "worker",
1318+
Labels: map[string]string{"name": "worker"},
1319+
},
1320+
Status: machineconfigurationv1.MachineConfigPoolStatus{
1321+
MachineCount: 3,
1322+
UpdatedMachineCount: 3,
1323+
},
1324+
}
1325+
1326+
upgradeJob := &managedupgradev1beta1.UpgradeJob{
1327+
ObjectMeta: metav1.ObjectMeta{
1328+
Name: "upgrade-1234-4-5-13",
1329+
Namespace: "appuio-openshift-upgrade-controller",
1330+
},
1331+
Spec: managedupgradev1beta1.UpgradeJobSpec{
1332+
StartBefore: metav1.NewTime(clock.Now().Add(3 * time.Hour)),
1333+
StartAfter: metav1.NewTime(clock.Now().Add(-time.Minute)),
1334+
DesiredVersion: &configv1.Update{
1335+
Version: "4.5.13",
1336+
},
1337+
UpgradeJobConfig: managedupgradev1beta1.UpgradeJobConfig{
1338+
UpgradeTimeout: metav1.Duration{Duration: 12 * time.Hour},
1339+
MachineConfigPools: []managedupgradev1beta1.UpgradeJobMachineConfigPoolSpec{
1340+
{
1341+
MatchLabels: &metav1.LabelSelector{
1342+
MatchLabels: map[string]string{"name": "worker"},
1343+
},
1344+
DelayUpgrade: managedupgradev1beta1.UpgradeJobMachineConfigPoolDelayUpgradeSpec{
1345+
DelayMin: metav1.Duration{Duration: 1 * time.Hour},
1346+
DelayMax: metav1.Duration{Duration: 2 * time.Hour},
1347+
},
1348+
},
1349+
},
1350+
},
1351+
},
1352+
}
1353+
1354+
client := controllerClient(t, ucv, upgradeJob, workerPool)
1355+
1356+
subject := &UpgradeJobReconciler{
1357+
Client: client,
1358+
Scheme: client.Scheme(),
1359+
1360+
Clock: &clock,
1361+
1362+
ManagedUpstreamClusterVersionName: "version",
1363+
}
1364+
1365+
t.Log("check that upgrade job is started and machine config pools are paused")
1366+
reconcileNTimes(t, subject, ctx, requestForObject(upgradeJob), 10)
1367+
require.NoError(t, client.Get(ctx, requestForObject(upgradeJob).NamespacedName, upgradeJob))
1368+
startedCond := apimeta.FindStatusCondition(upgradeJob.Status.Conditions, managedupgradev1beta1.UpgradeJobConditionStarted)
1369+
require.NotNil(t, startedCond, "should have started upgrade")
1370+
pausedCond := apimeta.FindStatusCondition(upgradeJob.Status.Conditions, managedupgradev1beta1.UpgradeJobConditionMachineConfigPoolsPaused)
1371+
require.NotNil(t, pausedCond, "should have paused mcp upgrades")
1372+
require.Equal(t, metav1.ConditionTrue, pausedCond.Status)
1373+
require.NoError(t, client.Get(ctx, requestForObject(workerPool).NamespacedName, workerPool))
1374+
require.True(t, workerPool.Spec.Paused, "should have paused worker mcp")
1375+
1376+
t.Log("finish the upgrade")
1377+
require.NoError(t, client.Get(ctx, requestForObject(ucv).NamespacedName, ucv))
1378+
ucv.Status.History = append(ucv.Status.History, configv1.UpdateHistory{
1379+
State: configv1.CompletedUpdate,
1380+
Version: upgradeJob.Spec.DesiredVersion.Version,
1381+
Image: upgradeJob.Spec.DesiredVersion.Image,
1382+
})
1383+
require.NoError(t, client.Status().Update(ctx, ucv))
1384+
reconcileNTimes(t, subject, ctx, requestForObject(upgradeJob), 5)
1385+
1386+
t.Log("check that job is done and ensure machine config pools are unpaused")
1387+
require.NoError(t, client.Get(ctx, requestForObject(upgradeJob).NamespacedName, upgradeJob))
1388+
succeededCond := apimeta.FindStatusCondition(upgradeJob.Status.Conditions, managedupgradev1beta1.UpgradeJobConditionSucceeded)
1389+
require.NotNil(t, succeededCond)
1390+
require.Equal(t, metav1.ConditionTrue, succeededCond.Status)
1391+
require.NoError(t, client.Get(ctx, requestForObject(workerPool).NamespacedName, workerPool))
1392+
require.False(t, workerPool.Spec.Paused, "should have unpaused worker mcp for completed job")
1393+
}
1394+
12921395
func Test_JobFromClusterVersionHandler(t *testing.T) {
12931396
ucv := &configv1.ClusterVersion{
12941397
ObjectMeta: metav1.ObjectMeta{

0 commit comments

Comments
 (0)