1- /**
2- *
1+ /*
32 * Licensed to the Apache Software Foundation (ASF) under one
43 * or more contributor license agreements. See the NOTICE file
54 * distributed with this work for additional information
2019
2120import java .util .ArrayList ;
2221import java .util .Collections ;
23- import java .util .Comparator ;
2422import java .util .Date ;
2523import java .util .HashMap ;
2624import java .util .HashSet ;
4038
4139/**
4240 * Class to hold dead servers list and utility querying dead server list.
43- * On znode expiration, servers are added here.
41+ * Servers are added when they expire or when we find them in filesystem on startup.
42+ * When a server crash procedure is queued, it will populate the processing list and
43+ * then remove the server from processing list when done. Servers are removed from
44+ * dead server list when a new instance is started over the old on same hostname and
45+ * port or when new Master comes online tidying up after all initialization. Processing
46+ * list and deadserver list are not tied together (you don't have to be in deadservers
47+ * list to be processing and vice versa).
4448 */
4549@ InterfaceAudience .Private
4650public class DeadServer {
@@ -56,37 +60,11 @@ public class DeadServer {
5660 private final Map <ServerName , Long > deadServers = new HashMap <>();
5761
5862 /**
59- * Set of dead servers currently being processed
60- */
61- private final Set <ServerName > processingServers = new HashSet <ServerName >();
62-
63- /**
64- * Handles restart of a server. The new server instance has a different start code.
65- * The new start code should be greater than the old one. We don't check that here.
66- *
67- * @param newServerName Servername as either <code>host:port</code> or
68- * <code>host,port,startcode</code>.
69- * @return true if this server was dead before and coming back alive again
63+ * Set of dead servers currently being processed by a SCP.
64+ * Added to this list at the start of SCP and removed after it is done
65+ * processing the crash.
7066 */
71- public synchronized boolean cleanPreviousInstance (final ServerName newServerName ) {
72- Iterator <ServerName > it = deadServers .keySet ().iterator ();
73- while (it .hasNext ()) {
74- ServerName sn = it .next ();
75- if (ServerName .isSameAddress (sn , newServerName )) {
76- // remove from deadServers
77- it .remove ();
78- // remove from processingServers
79- boolean removed = processingServers .remove (sn );
80- if (removed ) {
81- LOG .debug ("Removed {}, processing={}, numProcessing={}" , sn , removed ,
82- processingServers .size ());
83- }
84- return true ;
85- }
86- }
87-
88- return false ;
89- }
67+ private final Set <ServerName > processingServers = new HashSet <>();
9068
9169 /**
9270 * @param serverName server name.
@@ -96,22 +74,14 @@ public synchronized boolean isDeadServer(final ServerName serverName) {
9674 return deadServers .containsKey (serverName );
9775 }
9876
99- /**
100- * @param serverName server name.
101- * @return true if this server is on the processing servers list false otherwise
102- */
103- public synchronized boolean isProcessingServer (final ServerName serverName ) {
104- return processingServers .contains (serverName );
105- }
106-
10777 /**
10878 * Checks if there are currently any dead servers being processed by the
10979 * master. Returns true if at least one region server is currently being
11080 * processed as dead.
11181 *
11282 * @return true if any RS are being processed as dead
11383 */
114- public synchronized boolean areDeadServersInProgress () {
84+ synchronized boolean areDeadServersInProgress () {
11585 return !processingServers .isEmpty ();
11686 }
11787
@@ -124,72 +94,90 @@ public synchronized Set<ServerName> copyServerNames() {
12494 /**
12595 * Adds the server to the dead server list if it's not there already.
12696 */
127- public synchronized void add (ServerName sn ) {
128- if (!deadServers .containsKey (sn )){
129- deadServers .put (sn , EnvironmentEdgeManager .currentTime ());
130- }
131- boolean added = processingServers .add (sn );
132- if (LOG .isDebugEnabled () && added ) {
133- LOG .debug ("Added " + sn + "; numProcessing=" + processingServers .size ());
134- }
97+ synchronized void putIfAbsent (ServerName sn ) {
98+ this .deadServers .putIfAbsent (sn , EnvironmentEdgeManager .currentTime ());
99+ processing (sn );
135100 }
136101
137102 /**
138- * Notify that we started processing this dead server .
139- * @param sn ServerName for the dead server.
103+ * Add <code>sn<</code> to set of processing deadservers .
104+ * @see #finish( ServerName)
140105 */
141- public synchronized void notifyServer (ServerName sn ) {
142- boolean added = processingServers .add (sn );
143- if (LOG .isDebugEnabled ()) {
144- if (added ) {
145- LOG .debug ("Added " + sn + "; numProcessing=" + processingServers .size ());
146- }
147- LOG .debug ("Started processing " + sn + "; numProcessing=" + processingServers .size ());
106+ public synchronized void processing (ServerName sn ) {
107+ if (processingServers .add (sn )) {
108+ // Only log on add.
109+ LOG .debug ("Processing {}; numProcessing={}" , sn , processingServers .size ());
148110 }
149111 }
150112
151113 /**
152114 * Complete processing for this dead server.
153115 * @param sn ServerName for the dead server.
116+ * @see #processing(ServerName)
154117 */
155118 public synchronized void finish (ServerName sn ) {
156- boolean removed = processingServers .remove (sn );
157- if (LOG .isDebugEnabled ()) {
158- LOG .debug ("Finished processing " + sn + "; numProcessing=" + processingServers .size ());
159- if (removed ) {
160- LOG .debug ("Removed " + sn + " ; numProcessing=" + processingServers .size ());
161- }
119+ if (processingServers .remove (sn )) {
120+ LOG .debug ("Removed {} from processing; numProcessing={}" , sn , processingServers .size ());
162121 }
163122 }
164123
165124 public synchronized int size () {
166125 return deadServers .size ();
167126 }
168127
169- public synchronized boolean isEmpty () {
128+ synchronized boolean isEmpty () {
170129 return deadServers .isEmpty ();
171130 }
172131
173- public synchronized void cleanAllPreviousInstances (final ServerName newServerName ) {
132+ /**
133+ * Handles restart of a server. The new server instance has a different start code.
134+ * The new start code should be greater than the old one. We don't check that here.
135+ * Removes the old server from deadserver list.
136+ *
137+ * @param newServerName Servername as either <code>host:port</code> or
138+ * <code>host,port,startcode</code>.
139+ * @return true if this server was dead before and coming back alive again
140+ */
141+ synchronized boolean cleanPreviousInstance (final ServerName newServerName ) {
174142 Iterator <ServerName > it = deadServers .keySet ().iterator ();
175143 while (it .hasNext ()) {
176- ServerName sn = it .next ();
177- if (ServerName .isSameAddress (sn , newServerName )) {
178- // remove from deadServers
179- it .remove ();
180- // remove from processingServers
181- boolean removed = processingServers .remove (sn );
182- if (removed ) {
183- LOG .debug ("Removed " + sn + " ; numProcessing=" + processingServers .size ());
184- }
144+ if (cleanOldServerName (newServerName , it )) {
145+ return true ;
185146 }
186147 }
148+ return false ;
149+ }
150+
151+ synchronized void cleanAllPreviousInstances (final ServerName newServerName ) {
152+ Iterator <ServerName > it = deadServers .keySet ().iterator ();
153+ while (it .hasNext ()) {
154+ cleanOldServerName (newServerName , it );
155+ }
156+ }
157+
158+ /**
159+ * @param newServerName Server to match port and hostname against.
160+ * @param deadServerIterator Iterator primed so can call 'next' on it.
161+ * @return True if <code>newServerName</code> and current primed
162+ * iterator ServerName have same host and port and we removed old server
163+ * from iterator and from processing list.
164+ */
165+ private boolean cleanOldServerName (ServerName newServerName ,
166+ Iterator <ServerName > deadServerIterator ) {
167+ ServerName sn = deadServerIterator .next ();
168+ if (ServerName .isSameAddress (sn , newServerName )) {
169+ // Remove from dead servers list. Don't remove from the processing list --
170+ // let the SCP do it when it is done.
171+ deadServerIterator .remove ();
172+ return true ;
173+ }
174+ return false ;
187175 }
188176
189177 @ Override
190178 public synchronized String toString () {
191179 // Display unified set of servers from both maps
192- Set <ServerName > servers = new HashSet <ServerName >();
180+ Set <ServerName > servers = new HashSet <>();
193181 servers .addAll (deadServers .keySet ());
194182 servers .addAll (processingServers );
195183 StringBuilder sb = new StringBuilder ();
@@ -211,7 +199,7 @@ public synchronized String toString() {
211199 * @param ts the time, 0 for all
212200 * @return a sorted array list, by death time, lowest values first.
213201 */
214- public synchronized List <Pair <ServerName , Long >> copyDeadServersSince (long ts ){
202+ synchronized List <Pair <ServerName , Long >> copyDeadServersSince (long ts ) {
215203 List <Pair <ServerName , Long >> res = new ArrayList <>(size ());
216204
217205 for (Map .Entry <ServerName , Long > entry :deadServers .entrySet ()){
@@ -220,7 +208,7 @@ public synchronized List<Pair<ServerName, Long>> copyDeadServersSince(long ts){
220208 }
221209 }
222210
223- Collections .sort (res , ServerNameDeathDateComparator );
211+ Collections .sort (res , ( o1 , o2 ) -> o1 . getSecond (). compareTo ( o2 . getSecond ()) );
224212 return res ;
225213 }
226214
@@ -234,28 +222,15 @@ public synchronized Date getTimeOfDeath(final ServerName deadServerName){
234222 return time == null ? null : new Date (time );
235223 }
236224
237- private static Comparator <Pair <ServerName , Long >> ServerNameDeathDateComparator =
238- new Comparator <Pair <ServerName , Long >>(){
239-
240- @ Override
241- public int compare (Pair <ServerName , Long > o1 , Pair <ServerName , Long > o2 ) {
242- return o1 .getSecond ().compareTo (o2 .getSecond ());
243- }
244- };
245-
246225 /**
247- * remove the specified dead server
226+ * Called from rpc by operator cleaning up deadserver list.
248227 * @param deadServerName the dead server name
249228 * @return true if this server was removed
250229 */
251-
252230 public synchronized boolean removeDeadServer (final ServerName deadServerName ) {
253231 Preconditions .checkState (!processingServers .contains (deadServerName ),
254232 "Asked to remove server still in processingServers set " + deadServerName +
255233 " (numProcessing=" + processingServers .size () + ")" );
256- if (deadServers .remove (deadServerName ) == null ) {
257- return false ;
258- }
259- return true ;
234+ return this .deadServers .remove (deadServerName ) != null ;
260235 }
261236}
0 commit comments