Skip to content

Commit abcb1ee

Browse files
committed
HBASE-23632 DeadServer cleanup (#979)
Signed-off-by: Bharath Vissapragada <bharathv@apache.org>
1 parent db7fb06 commit abcb1ee

5 files changed

Lines changed: 103 additions & 125 deletions

File tree

Lines changed: 70 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
/**
2-
*
1+
/*
32
* Licensed to the Apache Software Foundation (ASF) under one
43
* or more contributor license agreements. See the NOTICE file
54
* distributed with this work for additional information
@@ -20,7 +19,6 @@
2019

2120
import java.util.ArrayList;
2221
import java.util.Collections;
23-
import java.util.Comparator;
2422
import java.util.Date;
2523
import java.util.HashMap;
2624
import java.util.HashSet;
@@ -40,7 +38,13 @@
4038

4139
/**
4240
* Class to hold dead servers list and utility querying dead server list.
43-
* On znode expiration, servers are added here.
41+
* Servers are added when they expire or when we find them in filesystem on startup.
42+
* When a server crash procedure is queued, it will populate the processing list and
43+
* then remove the server from processing list when done. Servers are removed from
44+
* dead server list when a new instance is started over the old on same hostname and
45+
* port or when new Master comes online tidying up after all initialization. Processing
46+
* list and deadserver list are not tied together (you don't have to be in deadservers
47+
* list to be processing and vice versa).
4448
*/
4549
@InterfaceAudience.Private
4650
public class DeadServer {
@@ -56,37 +60,11 @@ public class DeadServer {
5660
private final Map<ServerName, Long> deadServers = new HashMap<>();
5761

5862
/**
59-
* Set of dead servers currently being processed
60-
*/
61-
private final Set<ServerName> processingServers = new HashSet<ServerName>();
62-
63-
/**
64-
* Handles restart of a server. The new server instance has a different start code.
65-
* The new start code should be greater than the old one. We don't check that here.
66-
*
67-
* @param newServerName Servername as either <code>host:port</code> or
68-
* <code>host,port,startcode</code>.
69-
* @return true if this server was dead before and coming back alive again
63+
* Set of dead servers currently being processed by a SCP.
64+
* Added to this list at the start of SCP and removed after it is done
65+
* processing the crash.
7066
*/
71-
public synchronized boolean cleanPreviousInstance(final ServerName newServerName) {
72-
Iterator<ServerName> it = deadServers.keySet().iterator();
73-
while (it.hasNext()) {
74-
ServerName sn = it.next();
75-
if (ServerName.isSameAddress(sn, newServerName)) {
76-
// remove from deadServers
77-
it.remove();
78-
// remove from processingServers
79-
boolean removed = processingServers.remove(sn);
80-
if (removed) {
81-
LOG.debug("Removed {}, processing={}, numProcessing={}", sn, removed,
82-
processingServers.size());
83-
}
84-
return true;
85-
}
86-
}
87-
88-
return false;
89-
}
67+
private final Set<ServerName> processingServers = new HashSet<>();
9068

9169
/**
9270
* @param serverName server name.
@@ -96,22 +74,14 @@ public synchronized boolean isDeadServer(final ServerName serverName) {
9674
return deadServers.containsKey(serverName);
9775
}
9876

99-
/**
100-
* @param serverName server name.
101-
* @return true if this server is on the processing servers list false otherwise
102-
*/
103-
public synchronized boolean isProcessingServer(final ServerName serverName) {
104-
return processingServers.contains(serverName);
105-
}
106-
10777
/**
10878
* Checks if there are currently any dead servers being processed by the
10979
* master. Returns true if at least one region server is currently being
11080
* processed as dead.
11181
*
11282
* @return true if any RS are being processed as dead
11383
*/
114-
public synchronized boolean areDeadServersInProgress() {
84+
synchronized boolean areDeadServersInProgress() {
11585
return !processingServers.isEmpty();
11686
}
11787

@@ -124,72 +94,90 @@ public synchronized Set<ServerName> copyServerNames() {
12494
/**
12595
* Adds the server to the dead server list if it's not there already.
12696
*/
127-
public synchronized void add(ServerName sn) {
128-
if (!deadServers.containsKey(sn)){
129-
deadServers.put(sn, EnvironmentEdgeManager.currentTime());
130-
}
131-
boolean added = processingServers.add(sn);
132-
if (LOG.isDebugEnabled() && added) {
133-
LOG.debug("Added " + sn + "; numProcessing=" + processingServers.size());
134-
}
97+
synchronized void putIfAbsent(ServerName sn) {
98+
this.deadServers.putIfAbsent(sn, EnvironmentEdgeManager.currentTime());
99+
processing(sn);
135100
}
136101

137102
/**
138-
* Notify that we started processing this dead server.
139-
* @param sn ServerName for the dead server.
103+
* Add <code>sn<</code> to set of processing deadservers.
104+
* @see #finish(ServerName)
140105
*/
141-
public synchronized void notifyServer(ServerName sn) {
142-
boolean added = processingServers.add(sn);
143-
if (LOG.isDebugEnabled()) {
144-
if (added) {
145-
LOG.debug("Added " + sn + "; numProcessing=" + processingServers.size());
146-
}
147-
LOG.debug("Started processing " + sn + "; numProcessing=" + processingServers.size());
106+
public synchronized void processing(ServerName sn) {
107+
if (processingServers.add(sn)) {
108+
// Only log on add.
109+
LOG.debug("Processing {}; numProcessing={}", sn, processingServers.size());
148110
}
149111
}
150112

151113
/**
152114
* Complete processing for this dead server.
153115
* @param sn ServerName for the dead server.
116+
* @see #processing(ServerName)
154117
*/
155118
public synchronized void finish(ServerName sn) {
156-
boolean removed = processingServers.remove(sn);
157-
if (LOG.isDebugEnabled()) {
158-
LOG.debug("Finished processing " + sn + "; numProcessing=" + processingServers.size());
159-
if (removed) {
160-
LOG.debug("Removed " + sn + " ; numProcessing=" + processingServers.size());
161-
}
119+
if (processingServers.remove(sn)) {
120+
LOG.debug("Removed {} from processing; numProcessing={}", sn, processingServers.size());
162121
}
163122
}
164123

165124
public synchronized int size() {
166125
return deadServers.size();
167126
}
168127

169-
public synchronized boolean isEmpty() {
128+
synchronized boolean isEmpty() {
170129
return deadServers.isEmpty();
171130
}
172131

173-
public synchronized void cleanAllPreviousInstances(final ServerName newServerName) {
132+
/**
133+
* Handles restart of a server. The new server instance has a different start code.
134+
* The new start code should be greater than the old one. We don't check that here.
135+
* Removes the old server from deadserver list.
136+
*
137+
* @param newServerName Servername as either <code>host:port</code> or
138+
* <code>host,port,startcode</code>.
139+
* @return true if this server was dead before and coming back alive again
140+
*/
141+
synchronized boolean cleanPreviousInstance(final ServerName newServerName) {
174142
Iterator<ServerName> it = deadServers.keySet().iterator();
175143
while (it.hasNext()) {
176-
ServerName sn = it.next();
177-
if (ServerName.isSameAddress(sn, newServerName)) {
178-
// remove from deadServers
179-
it.remove();
180-
// remove from processingServers
181-
boolean removed = processingServers.remove(sn);
182-
if (removed) {
183-
LOG.debug("Removed " + sn + " ; numProcessing=" + processingServers.size());
184-
}
144+
if (cleanOldServerName(newServerName, it)) {
145+
return true;
185146
}
186147
}
148+
return false;
149+
}
150+
151+
synchronized void cleanAllPreviousInstances(final ServerName newServerName) {
152+
Iterator<ServerName> it = deadServers.keySet().iterator();
153+
while (it.hasNext()) {
154+
cleanOldServerName(newServerName, it);
155+
}
156+
}
157+
158+
/**
159+
* @param newServerName Server to match port and hostname against.
160+
* @param deadServerIterator Iterator primed so can call 'next' on it.
161+
* @return True if <code>newServerName</code> and current primed
162+
* iterator ServerName have same host and port and we removed old server
163+
* from iterator and from processing list.
164+
*/
165+
private boolean cleanOldServerName(ServerName newServerName,
166+
Iterator<ServerName> deadServerIterator) {
167+
ServerName sn = deadServerIterator.next();
168+
if (ServerName.isSameAddress(sn, newServerName)) {
169+
// Remove from dead servers list. Don't remove from the processing list --
170+
// let the SCP do it when it is done.
171+
deadServerIterator.remove();
172+
return true;
173+
}
174+
return false;
187175
}
188176

189177
@Override
190178
public synchronized String toString() {
191179
// Display unified set of servers from both maps
192-
Set<ServerName> servers = new HashSet<ServerName>();
180+
Set<ServerName> servers = new HashSet<>();
193181
servers.addAll(deadServers.keySet());
194182
servers.addAll(processingServers);
195183
StringBuilder sb = new StringBuilder();
@@ -211,7 +199,7 @@ public synchronized String toString() {
211199
* @param ts the time, 0 for all
212200
* @return a sorted array list, by death time, lowest values first.
213201
*/
214-
public synchronized List<Pair<ServerName, Long>> copyDeadServersSince(long ts){
202+
synchronized List<Pair<ServerName, Long>> copyDeadServersSince(long ts) {
215203
List<Pair<ServerName, Long>> res = new ArrayList<>(size());
216204

217205
for (Map.Entry<ServerName, Long> entry:deadServers.entrySet()){
@@ -220,7 +208,7 @@ public synchronized List<Pair<ServerName, Long>> copyDeadServersSince(long ts){
220208
}
221209
}
222210

223-
Collections.sort(res, ServerNameDeathDateComparator);
211+
Collections.sort(res, (o1, o2) -> o1.getSecond().compareTo(o2.getSecond()));
224212
return res;
225213
}
226214

@@ -234,28 +222,15 @@ public synchronized Date getTimeOfDeath(final ServerName deadServerName){
234222
return time == null ? null : new Date(time);
235223
}
236224

237-
private static Comparator<Pair<ServerName, Long>> ServerNameDeathDateComparator =
238-
new Comparator<Pair<ServerName, Long>>(){
239-
240-
@Override
241-
public int compare(Pair<ServerName, Long> o1, Pair<ServerName, Long> o2) {
242-
return o1.getSecond().compareTo(o2.getSecond());
243-
}
244-
};
245-
246225
/**
247-
* remove the specified dead server
226+
* Called from rpc by operator cleaning up deadserver list.
248227
* @param deadServerName the dead server name
249228
* @return true if this server was removed
250229
*/
251-
252230
public synchronized boolean removeDeadServer(final ServerName deadServerName) {
253231
Preconditions.checkState(!processingServers.contains(deadServerName),
254232
"Asked to remove server still in processingServers set " + deadServerName +
255233
" (numProcessing=" + processingServers.size() + ")");
256-
if (deadServers.remove(deadServerName) == null) {
257-
return false;
258-
}
259-
return true;
234+
return this.deadServers.remove(deadServerName) != null;
260235
}
261236
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ boolean checkAndRecordNewServer(final ServerName serverName, final ServerMetrics
345345
*/
346346
void findDeadServersAndProcess(Set<ServerName> deadServersFromPE,
347347
Set<ServerName> liveServersFromWALDir) {
348-
deadServersFromPE.forEach(deadservers::add);
348+
deadServersFromPE.forEach(deadservers::putIfAbsent);
349349
liveServersFromWALDir.stream().filter(sn -> !onlineServers.containsKey(sn))
350350
.forEach(this::expireServer);
351351
}
@@ -376,6 +376,8 @@ private void checkClockSkew(final ServerName serverName, final long serverCurren
376376
}
377377

378378
/**
379+
* Called when RegionServer first reports in for duty and thereafter each
380+
* time it heartbeats to make sure it is has not been figured for dead.
379381
* If this server is on the dead list, reject it with a YouAreDeadException.
380382
* If it was dead but came back with a new start code, remove the old entry
381383
* from the dead list.
@@ -384,21 +386,20 @@ private void checkClockSkew(final ServerName serverName, final long serverCurren
384386
private void checkIsDead(final ServerName serverName, final String what)
385387
throws YouAreDeadException {
386388
if (this.deadservers.isDeadServer(serverName)) {
387-
// host name, port and start code all match with existing one of the
388-
// dead servers. So, this server must be dead.
389+
// Exact match: host name, port and start code all match with existing one of the
390+
// dead servers. So, this server must be dead. Tell it to kill itself.
389391
String message = "Server " + what + " rejected; currently processing " +
390392
serverName + " as dead server";
391393
LOG.debug(message);
392394
throw new YouAreDeadException(message);
393395
}
394-
// remove dead server with same hostname and port of newly checking in rs after master
395-
// initialization.See HBASE-5916 for more information.
396-
if ((this.master == null || this.master.isInitialized())
397-
&& this.deadservers.cleanPreviousInstance(serverName)) {
396+
// Remove dead server with same hostname and port of newly checking in rs after master
397+
// initialization. See HBASE-5916 for more information.
398+
if ((this.master == null || this.master.isInitialized()) &&
399+
this.deadservers.cleanPreviousInstance(serverName)) {
398400
// This server has now become alive after we marked it as dead.
399401
// We removed it's previous entry from the dead list to reflect it.
400-
LOG.debug(what + ":" + " Server " + serverName + " came back up," +
401-
" removed it from the dead servers list");
402+
LOG.debug("{} {} came back up, removed it from the dead servers list", what, serverName);
402403
}
403404
}
404405

@@ -609,7 +610,10 @@ synchronized long expireServer(final ServerName serverName, boolean force) {
609610
return pid;
610611
}
611612

612-
// Note: this is currently invoked from RPC, not just tests. Locking in this class needs cleanup.
613+
/**
614+
* Called when server has expired.
615+
*/
616+
// Locking in this class needs cleanup.
613617
@VisibleForTesting
614618
public synchronized void moveFromOnlineToDeadServers(final ServerName sn) {
615619
synchronized (this.onlineServers) {
@@ -618,7 +622,7 @@ public synchronized void moveFromOnlineToDeadServers(final ServerName sn) {
618622
// Remove the server from the known servers lists and update load info BUT
619623
// add to deadservers first; do this so it'll show in dead servers list if
620624
// not in online servers list.
621-
this.deadservers.add(sn);
625+
this.deadservers.putIfAbsent(sn);
622626
this.onlineServers.remove(sn);
623627
onlineServers.notifyAll();
624628
} else {
@@ -872,7 +876,7 @@ public synchronized ServerLiveState isServerKnownAndOnline(ServerName serverName
872876
/**
873877
* Check if a server is known to be dead. A server can be online,
874878
* or known to be dead, or unknown to this manager (i.e, not online,
875-
* not known to be dead either. it is simply not tracked by the
879+
* not known to be dead either; it is simply not tracked by the
876880
* master any more, for example, a very old previous instance).
877881
*/
878882
public synchronized boolean isServerDead(ServerName serverName) {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
124124
// This adds server to the DeadServer processing list but not to the DeadServers list.
125125
// Server gets removed from processing list below on procedure successful finish.
126126
if (!notifiedDeadServer) {
127-
services.getServerManager().getDeadServers().notifyServer(serverName);
127+
services.getServerManager().getDeadServers().processing(serverName);
128128
notifiedDeadServer = true;
129129
}
130130

0 commit comments

Comments
 (0)