Skip to content

Commit 6008ffe

Browse files
ddupginfraio
authored andcommitted
HBASE-24684 Fetch ReplicationSink servers list from HMaster instead o… (#2077)
Signed-off-by: Wellington Chevreuil <wchevreuil@apache.org>
1 parent 8f8620d commit 6008ffe

13 files changed

Lines changed: 334 additions & 16 deletions

File tree

hbase-protocol-shaded/src/main/protobuf/server/master/Master.proto

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -717,6 +717,13 @@ message BalancerDecisionsResponse {
717717
repeated BalancerDecision balancer_decision = 1;
718718
}
719719

720+
message ListReplicationSinkServersRequest {
721+
}
722+
723+
message ListReplicationSinkServersResponse {
724+
repeated ServerName server_name = 1;
725+
}
726+
720727
service MasterService {
721728
/** Used by the client to get the number of regions that have received the updated schema */
722729
rpc GetSchemaAlterStatus(GetSchemaAlterStatusRequest)
@@ -1146,10 +1153,13 @@ service MasterService {
11461153
returns (RenameRSGroupResponse);
11471154

11481155
rpc UpdateRSGroupConfig(UpdateRSGroupConfigRequest)
1149-
returns (UpdateRSGroupConfigResponse);
1156+
returns (UpdateRSGroupConfigResponse);
11501157

11511158
rpc GetLogEntries(LogRequest)
11521159
returns(LogEntry);
1160+
1161+
rpc ListReplicationSinkServers(ListReplicationSinkServersRequest)
1162+
returns (ListReplicationSinkServersResponse);
11531163
}
11541164

11551165
// HBCK Service definitions.

hbase-server/src/main/java/org/apache/hadoop/hbase/coprocessor/MasterObserver.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1782,4 +1782,20 @@ default void preHasUserPermissions(ObserverContext<MasterCoprocessorEnvironment>
17821782
default void postHasUserPermissions(ObserverContext<MasterCoprocessorEnvironment> ctx,
17831783
String userName, List<Permission> permissions) throws IOException {
17841784
}
1785+
1786+
/**
1787+
* Called before getting servers for replication sink.
1788+
* @param ctx the coprocessor instance's environment
1789+
*/
1790+
default void preListReplicationSinkServers(ObserverContext<MasterCoprocessorEnvironment> ctx)
1791+
throws IOException {
1792+
}
1793+
1794+
/**
1795+
* Called after getting servers for replication sink.
1796+
* @param ctx the coprocessor instance's environment
1797+
*/
1798+
default void postListReplicationSinkServers(ObserverContext<MasterCoprocessorEnvironment> ctx)
1799+
throws IOException {
1800+
}
17851801
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3900,4 +3900,9 @@ public MetaRegionLocationCache getMetaRegionLocationCache() {
39003900
public RSGroupInfoManager getRSGroupInfoManager() {
39013901
return rsGroupInfoManager;
39023902
}
3903+
3904+
@Override
3905+
public List<ServerName> listReplicationSinkServers() throws IOException {
3906+
return this.serverManager.getOnlineServersList();
3907+
}
39033908
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterCoprocessorHost.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2038,4 +2038,22 @@ public void call(MasterObserver observer) throws IOException {
20382038
}
20392039
});
20402040
}
2041+
2042+
public void preListReplicationSinkServers() throws IOException {
2043+
execOperation(coprocEnvironments.isEmpty() ? null : new MasterObserverOperation() {
2044+
@Override
2045+
public void call(MasterObserver observer) throws IOException {
2046+
observer.preListReplicationSinkServers(this);
2047+
}
2048+
});
2049+
}
2050+
2051+
public void postListReplicationSinkServers() throws IOException {
2052+
execOperation(coprocEnvironments.isEmpty() ? null : new MasterObserverOperation() {
2053+
@Override
2054+
public void call(MasterObserver observer) throws IOException {
2055+
observer.postListReplicationSinkServers(this);
2056+
}
2057+
});
2058+
}
20412059
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,8 @@
263263
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListNamespaceDescriptorsResponse;
264264
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListNamespacesRequest;
265265
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListNamespacesResponse;
266+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListReplicationSinkServersRequest;
267+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListReplicationSinkServersResponse;
266268
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceRequest;
267269
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceResponse;
268270
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListTableNamesByNamespaceRequest;
@@ -3375,4 +3377,23 @@ private MasterProtos.BalancerDecisionsResponse getBalancerDecisions(
33753377
.addAllBalancerDecision(balancerDecisions).build();
33763378
}
33773379

3380+
public ListReplicationSinkServersResponse listReplicationSinkServers(
3381+
RpcController controller, ListReplicationSinkServersRequest request)
3382+
throws ServiceException {
3383+
ListReplicationSinkServersResponse.Builder builder =
3384+
ListReplicationSinkServersResponse.newBuilder();
3385+
try {
3386+
if (master.getMasterCoprocessorHost() != null) {
3387+
master.getMasterCoprocessorHost().preListReplicationSinkServers();
3388+
}
3389+
builder.addAllServerName(master.listReplicationSinkServers().stream()
3390+
.map(ProtobufUtil::toServerName).collect(Collectors.toList()));
3391+
if (master.getMasterCoprocessorHost() != null) {
3392+
master.getMasterCoprocessorHost().postListReplicationSinkServers();
3393+
}
3394+
} catch (IOException e) {
3395+
throw new ServiceException(e);
3396+
}
3397+
return builder.build();
3398+
}
33783399
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,4 +553,10 @@ default SplitWALManager getSplitWALManager(){
553553
* @return The state of the load balancer, or false if the load balancer isn't defined.
554554
*/
555555
boolean isBalancerOn();
556+
557+
/**
558+
* Get a list of servers' addresses for replication sink.
559+
* @return a list of servers' address
560+
*/
561+
List<ServerName> listReplicationSinkServers() throws IOException;
556562
}

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/HBaseReplicationEndpoint.java

Lines changed: 133 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818

1919
package org.apache.hadoop.hbase.replication;
2020

21+
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT;
22+
import static org.apache.hadoop.hbase.HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY;
23+
2124
import java.io.IOException;
2225
import java.util.ArrayList;
2326
import java.util.Collections;
@@ -28,21 +31,26 @@
2831

2932
import org.apache.hadoop.conf.Configuration;
3033
import org.apache.hadoop.fs.Path;
34+
import org.apache.hadoop.hbase.Abortable;
3135
import org.apache.hadoop.hbase.HBaseConfiguration;
36+
import org.apache.hadoop.hbase.ChoreService;
3237
import org.apache.hadoop.hbase.client.AsyncClusterConnection;
3338
import org.apache.hadoop.hbase.client.AsyncRegionServerAdmin;
3439
import org.apache.hadoop.hbase.client.AsyncReplicationServerAdmin;
3540
import org.apache.hadoop.hbase.client.ClusterConnectionFactory;
3641
import org.apache.hadoop.hbase.protobuf.ReplicationProtobufUtil;
42+
import org.apache.hadoop.hbase.ScheduledChore;
43+
import org.apache.hadoop.hbase.Server;
44+
import org.apache.hadoop.hbase.ServerName;
3745
import org.apache.hadoop.hbase.security.User;
46+
import org.apache.hadoop.hbase.security.UserProvider;
47+
import org.apache.hadoop.hbase.util.FutureUtils;
3848
import org.apache.hadoop.hbase.wal.WAL;
39-
import org.apache.hadoop.hbase.zookeeper.ZKListener;
40-
import org.apache.yetus.audience.InterfaceAudience;
41-
import org.apache.hadoop.hbase.Abortable;
42-
import org.apache.hadoop.hbase.ServerName;
4349
import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
50+
import org.apache.hadoop.hbase.zookeeper.ZKListener;
4451
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
4552
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
53+
import org.apache.yetus.audience.InterfaceAudience;
4654
import org.apache.zookeeper.KeeperException;
4755
import org.apache.zookeeper.KeeperException.AuthFailedException;
4856
import org.apache.zookeeper.KeeperException.ConnectionLossException;
@@ -52,6 +60,12 @@
5260

5361
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
5462
import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
63+
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
64+
65+
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
66+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListReplicationSinkServersRequest;
67+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListReplicationSinkServersResponse;
68+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService;
5569

5670
/**
5771
* A {@link BaseReplicationEndpoint} for replication endpoints whose
@@ -63,6 +77,13 @@ public abstract class HBaseReplicationEndpoint extends BaseReplicationEndpoint
6377

6478
private static final Logger LOG = LoggerFactory.getLogger(HBaseReplicationEndpoint.class);
6579

80+
public static final String FETCH_SERVERS_USE_ZK_CONF_KEY =
81+
"hbase.replication.fetch.servers.usezk";
82+
83+
public static final String FETCH_SERVERS_INTERVAL_CONF_KEY =
84+
"hbase.replication.fetch.servers.interval";
85+
public static final int DEFAULT_FETCH_SERVERS_INTERVAL = 10 * 60 * 1000; // 10 mins
86+
6687
private ZKWatcher zkw = null;
6788

6889
protected Configuration conf;
@@ -93,6 +114,11 @@ public abstract class HBaseReplicationEndpoint extends BaseReplicationEndpoint
93114

94115
private List<ServerName> sinkServers = new ArrayList<>(0);
95116

117+
private AsyncClusterConnection peerConnection;
118+
private boolean fetchServersUseZk = false;
119+
private FetchServersChore fetchServersChore;
120+
private int shortOperationTimeout;
121+
96122
/*
97123
* Some implementations of HBaseInterClusterReplicationEndpoint may require instantiate different
98124
* Connection implementations, or initialize it in a different way, so defining createConnection
@@ -122,6 +148,19 @@ protected synchronized void disconnect() {
122148
if (zkw != null) {
123149
zkw.close();
124150
}
151+
if (fetchServersChore != null) {
152+
ChoreService choreService = ctx.getServer().getChoreService();
153+
if (null != choreService) {
154+
choreService.cancelChore(fetchServersChore);
155+
}
156+
}
157+
if (peerConnection != null) {
158+
try {
159+
peerConnection.close();
160+
} catch (IOException e) {
161+
LOG.warn("Attempt to close peerConnection failed.", e);
162+
}
163+
}
125164
}
126165

127166
/**
@@ -152,8 +191,27 @@ public void stop() {
152191
}
153192

154193
@Override
155-
protected void doStart() {
194+
protected synchronized void doStart() {
195+
this.shortOperationTimeout = ctx.getLocalConfiguration().getInt(
196+
HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT);
156197
try {
198+
if (ctx.getLocalConfiguration().getBoolean(FETCH_SERVERS_USE_ZK_CONF_KEY, false)) {
199+
fetchServersUseZk = true;
200+
} else {
201+
try {
202+
if (ReplicationUtils.isPeerClusterSupportReplicationOffload(getPeerConnection())) {
203+
fetchServersChore = new FetchServersChore(ctx.getServer(), this);
204+
ctx.getServer().getChoreService().scheduleChore(fetchServersChore);
205+
fetchServersUseZk = false;
206+
} else {
207+
fetchServersUseZk = true;
208+
}
209+
} catch (Throwable t) {
210+
fetchServersUseZk = true;
211+
LOG.warn("Peer {} try to fetch servers by admin failed. Using zk impl.",
212+
ctx.getPeerId(), t);
213+
}
214+
}
157215
reloadZkWatcher();
158216
notifyStarted();
159217
} catch (IOException e) {
@@ -192,7 +250,9 @@ private synchronized void reloadZkWatcher() throws IOException {
192250
}
193251
zkw = new ZKWatcher(ctx.getConfiguration(),
194252
"connection to cluster: " + ctx.getPeerId(), this);
195-
zkw.registerListener(new PeerRegionServerListener(this));
253+
if (fetchServersUseZk) {
254+
zkw.registerListener(new PeerRegionServerListener(this));
255+
}
196256
}
197257

198258
@Override
@@ -207,12 +267,47 @@ public boolean isAborted() {
207267
return false;
208268
}
209269

270+
/**
271+
* Get the connection to peer cluster
272+
* @return connection to peer cluster
273+
* @throws IOException If anything goes wrong connecting
274+
*/
275+
private synchronized AsyncClusterConnection getPeerConnection() throws IOException {
276+
if (peerConnection == null) {
277+
Configuration conf = ctx.getConfiguration();
278+
peerConnection = ClusterConnectionFactory.createAsyncClusterConnection(conf, null,
279+
UserProvider.instantiate(conf).getCurrent());
280+
}
281+
return peerConnection;
282+
}
283+
284+
/**
285+
* Get the list of all the servers that are responsible for replication sink
286+
* from the specified peer master
287+
* @return list of server addresses or an empty list if the slave is unavailable
288+
*/
289+
protected List<ServerName> fetchSlavesAddresses() {
290+
try {
291+
AsyncClusterConnection peerConn = getPeerConnection();
292+
ServerName master = FutureUtils.get(peerConn.getAdmin().getMaster());
293+
MasterService.BlockingInterface masterStub = MasterService.newBlockingStub(
294+
peerConn.getRpcClient()
295+
.createBlockingRpcChannel(master, User.getCurrent(), shortOperationTimeout));
296+
ListReplicationSinkServersResponse resp = masterStub
297+
.listReplicationSinkServers(null, ListReplicationSinkServersRequest.newBuilder().build());
298+
return ProtobufUtil.toServerNameList(resp.getServerNameList());
299+
} catch (ServiceException | IOException e) {
300+
LOG.error("Peer {} fetches servers failed", ctx.getPeerId(), e);
301+
}
302+
return Collections.emptyList();
303+
}
304+
210305
/**
211306
* Get the list of all the region servers from the specified peer
212307
*
213308
* @return list of region server addresses or an empty list if the slave is unavailable
214309
*/
215-
protected List<ServerName> fetchSlavesAddresses() {
310+
protected List<ServerName> fetchSlavesAddressesByZK() {
216311
List<String> children = null;
217312
try {
218313
children = ZKUtil.listChildrenAndWatchForNewChildren(zkw, zkw.getZNodePaths().rsZNode);
@@ -233,7 +328,12 @@ protected List<ServerName> fetchSlavesAddresses() {
233328
}
234329

235330
protected synchronized void chooseSinks() {
236-
List<ServerName> slaveAddresses = fetchSlavesAddresses();
331+
List<ServerName> slaveAddresses = Collections.emptyList();
332+
if (fetchServersUseZk) {
333+
slaveAddresses = fetchSlavesAddressesByZK();
334+
} else {
335+
slaveAddresses = fetchSlavesAddresses();
336+
}
237337
if (slaveAddresses.isEmpty()) {
238338
LOG.warn("No sinks available at peer. Will not be able to replicate");
239339
}
@@ -264,6 +364,14 @@ protected synchronized SinkPeer getReplicationSink() throws IOException {
264364
return createSinkPeer(serverName);
265365
}
266366

367+
private SinkPeer createSinkPeer(ServerName serverName) throws IOException {
368+
if (ReplicationUtils.isPeerClusterSupportReplicationOffload(conn)) {
369+
return new ReplicationServerSinkPeer(serverName, conn.getReplicationServerAdmin(serverName));
370+
} else {
371+
return new RegionServerSinkPeer(serverName, conn.getRegionServerAdmin(serverName));
372+
}
373+
}
374+
267375
/**
268376
* Report a {@code SinkPeer} as being bad (i.e. an attempt to replicate to it
269377
* failed). If a single SinkPeer is reported as bad more than
@@ -373,11 +481,23 @@ public void replicateWALEntry(WAL.Entry[] entries, String replicationClusterId,
373481
}
374482
}
375483

376-
private SinkPeer createSinkPeer(ServerName serverName) throws IOException {
377-
if (ReplicationUtils.isPeerClusterSupportReplicationOffload(conn)) {
378-
return new ReplicationServerSinkPeer(serverName, conn.getReplicationServerAdmin(serverName));
379-
} else {
380-
return new RegionServerSinkPeer(serverName, conn.getRegionServerAdmin(serverName));
484+
/**
485+
* Chore that will fetch the list of servers from peer master.
486+
*/
487+
public static class FetchServersChore extends ScheduledChore {
488+
489+
private HBaseReplicationEndpoint endpoint;
490+
491+
public FetchServersChore(Server server, HBaseReplicationEndpoint endpoint) {
492+
super("Peer-" + endpoint.ctx.getPeerId() + "-FetchServersChore", server,
493+
server.getConfiguration()
494+
.getInt(FETCH_SERVERS_INTERVAL_CONF_KEY, DEFAULT_FETCH_SERVERS_INTERVAL));
495+
this.endpoint = endpoint;
496+
}
497+
498+
@Override
499+
protected void chore() {
500+
endpoint.chooseSinks();
381501
}
382502
}
383503
}

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,9 +343,9 @@ private void tryStartNewShipper(String walGroupId, PriorityBlockingQueue<Path> q
343343
Threads.setDaemonThreadRunning(
344344
walReader, Thread.currentThread().getName()
345345
+ ".replicationSource.wal-reader." + walGroupId + "," + queueId,
346-
(t,e) -> this.uncaughtException(t, e, this.manager, this.getPeerId()));
346+
(t,e) -> this.uncaughtException(t, e, null, this.getPeerId()));
347347
worker.setWALReader(walReader);
348-
worker.startup((t,e) -> this.uncaughtException(t, e, this.manager, this.getPeerId()));
348+
worker.startup((t,e) -> this.uncaughtException(t, e, null, this.getPeerId()));
349349
return worker;
350350
}
351351
});

hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,4 +507,9 @@ public RSGroupInfoManager getRSGroupInfoManager() {
507507
public boolean isBalancerOn() {
508508
return false;
509509
}
510+
511+
@Override
512+
public List<ServerName> listReplicationSinkServers() throws IOException {
513+
return null;
514+
}
510515
}

0 commit comments

Comments
 (0)