-
Notifications
You must be signed in to change notification settings - Fork 3.4k
HBASE-22414 Interruption of moving regions in RSGroup will cause regi… #323
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
a44b321
3471e7f
3586a3f
310bc21
27ecfc2
f9927e5
d12694c
e559550
cc9e0a0
ec50f4f
670ab0e
bdb68f5
3ed9621
9ca8b9d
963fba6
8bf26f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,7 @@ | |
| import java.util.Map; | ||
| import java.util.Set; | ||
| import org.apache.commons.lang3.StringUtils; | ||
| import org.apache.hadoop.hbase.DoNotRetryIOException; | ||
| import org.apache.hadoop.hbase.NamespaceDescriptor; | ||
| import org.apache.hadoop.hbase.ServerName; | ||
| import org.apache.hadoop.hbase.TableName; | ||
|
|
@@ -60,9 +61,13 @@ public class RSGroupAdminServer implements RSGroupAdmin { | |
| private MasterServices master; | ||
| private final RSGroupInfoManager rsGroupInfoManager; | ||
|
|
||
| private String FAILED_MOVE_MAX_RETRY = "hbase.rsgroup.move.max.retry"; | ||
| private int moveMaxRetry; | ||
|
|
||
| public RSGroupAdminServer(MasterServices master, RSGroupInfoManager rsGroupInfoManager) { | ||
| this.master = master; | ||
| this.rsGroupInfoManager = rsGroupInfoManager; | ||
| this.moveMaxRetry = master.getConfiguration().getInt(FAILED_MOVE_MAX_RETRY, 50); | ||
|
wchevreuil marked this conversation as resolved.
Outdated
|
||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -193,76 +198,107 @@ private void checkServersAndTables(Set<Address> servers, Set<TableName> tables, | |
| } | ||
| } | ||
|
|
||
| private enum MoveType { | ||
| TO, FROM | ||
| } | ||
|
|
||
| /** | ||
| * Move every region from servers which are currently located on these servers, | ||
| * but should not be located there. | ||
| * When move a table to a group, we can only move regions of it which are not on group servers | ||
| * TO the group. Because all regions of the table must be in target group. When move a server to | ||
| * a group, we can only move regions on it which do not belong to group tables FROM the group. | ||
| * And what's more, table regions or server regions that already in target group need not to be | ||
| * moved. So MoveType.TO means move regions of TABLES, and MoveType.FROM means move regions on | ||
| * SERVERS. | ||
| * | ||
| * @param servers the servers that will move to new group | ||
| * @param targetGroupName the target group name | ||
| * @throws IOException if moving the server and tables fail | ||
| * @param set it's a table set or a server set | ||
| * @param targetGroupName target group name | ||
| * @param type type of move regions | ||
| * @param <T> the type of elements in Set | ||
| * @throws IOException if move haven't succeed even after max number of retries | ||
| */ | ||
| private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroupName) | ||
| private <T> void moveRegionsToOrFromGroup(Set<T> set, String targetGroupName, MoveType type) | ||
| throws IOException { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this really a good idea to group "TO and FROM" into one method? It's an open question, just want to hear some reasoning from you. thanks
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think added To/From to method name makes codes more easy to be understood. |
||
| Set<T> newSet = new HashSet<>(set); | ||
| boolean hasRegionsToMove; | ||
| int retry = 0; | ||
| IOException toThrow = null; | ||
| Set<String> failedRegions = new HashSet<>(); | ||
| RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); | ||
| Set<Address> allSevers = new HashSet<>(servers); | ||
| do { | ||
| hasRegionsToMove = false; | ||
| for (Iterator<Address> iter = allSevers.iterator(); iter.hasNext();) { | ||
| Address rs = iter.next(); | ||
| // Get regions that are associated with this server and filter regions by group tables. | ||
| for (RegionInfo region : getRegions(rs)) { | ||
| if (!targetGrp.containsTable(region.getTable())) { | ||
| LOG.info("Moving server region {}, which do not belong to RSGroup {}", | ||
| region.getShortNameToLog(), targetGroupName); | ||
| this.master.getAssignmentManager().move(region); | ||
| if (master.getAssignmentManager().getRegionStates(). | ||
| getRegionState(region).isFailedOpen()) { | ||
| continue; | ||
| for (Iterator<T> iter = newSet.iterator(); iter.hasNext(); ) { | ||
| T element = iter.next(); | ||
| List<RegionInfo> toMoveRegions = new ArrayList<>(); | ||
| if (type == MoveType.TO) { | ||
| // means element type of set is TableName | ||
| assert element instanceof TableName; | ||
| if (master.getAssignmentManager().isTableDisabled((TableName) element)) { | ||
| LOG.debug("Skipping move regions because the table {} is disabled", element); | ||
| }else { | ||
| // Get regions of these tables and filter regions by group servers. | ||
| for (RegionInfo region : master.getAssignmentManager().getRegionStates() | ||
| .getRegionsOfTable((TableName) element)) { | ||
| ServerName sn = | ||
| master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); | ||
| if (!targetGrp.containsServer(sn.getAddress())) { | ||
| toMoveRegions.add(region); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| if (type == MoveType.FROM) { | ||
| // means element type of set is Address | ||
| assert element instanceof Address; | ||
| // Get regions that are associated with these servers and filter regions by group tables. | ||
| for (RegionInfo region : getRegions((Address) element)) { | ||
| if (!targetGrp.containsTable(region.getTable())) { | ||
| toMoveRegions.add(region); | ||
| } | ||
| hasRegionsToMove = true; | ||
| } | ||
| } | ||
|
|
||
| //move regions | ||
| for (RegionInfo region : toMoveRegions) { | ||
| LOG.info("Moving server region {}, which do not belong to RSGroup {}", | ||
| region.getShortNameToLog(), targetGroupName); | ||
| try { | ||
| this.master.getAssignmentManager().move(region); | ||
| failedRegions.remove(region.getRegionNameAsString()); | ||
| } catch (IOException ioe) { | ||
| LOG.debug("Move region {} from group failed, will retry, current retry time is {}", | ||
| region.getShortNameToLog(), retry, ioe); | ||
| toThrow = ioe; | ||
| failedRegions.add(region.getRegionNameAsString()); | ||
| } | ||
| if (master.getAssignmentManager().getRegionStates(). | ||
| getRegionState(region).isFailedOpen()) { | ||
| continue; | ||
| } | ||
| hasRegionsToMove = true; | ||
| } | ||
| if (!hasRegionsToMove) { | ||
| LOG.info("Server {} has no more regions to move for RSGroup", rs.getHostname()); | ||
| LOG.info("There are no more regions of {} to move for RSGroup {}", element, | ||
| targetGroupName); | ||
| iter.remove(); | ||
| } | ||
| } | ||
| retry++; | ||
| try { | ||
| rsGroupInfoManager.wait(1000); | ||
| } catch (InterruptedException e) { | ||
| LOG.warn("Sleep interrupted", e); | ||
| Thread.currentThread().interrupt(); | ||
| } | ||
| } while (hasRegionsToMove); | ||
| } | ||
| } while (hasRegionsToMove && retry <= moveMaxRetry); | ||
|
wchevreuil marked this conversation as resolved.
|
||
|
|
||
| /** | ||
| * Moves regions of tables which are not on target group servers. | ||
| * | ||
| * @param tables the tables that will move to new group | ||
| * @param targetGroupName the target group name | ||
| * @throws IOException if moving the region fails | ||
| */ | ||
| private void moveTableRegionsToGroup(Set<TableName> tables, String targetGroupName) | ||
| throws IOException { | ||
| RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); | ||
| for (TableName table : tables) { | ||
| if (master.getAssignmentManager().isTableDisabled(table)) { | ||
| LOG.debug("Skipping move regions because the table {} is disabled", table); | ||
| continue; | ||
| } | ||
| LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName); | ||
| for (RegionInfo region : master.getAssignmentManager().getRegionStates() | ||
| .getRegionsOfTable(table)) { | ||
| ServerName sn = | ||
| master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); | ||
| if (!targetGrp.containsServer(sn.getAddress())) { | ||
| LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName); | ||
| master.getAssignmentManager().move(region); | ||
| } | ||
| } | ||
| //has up to max retry time or there are no more regions to move | ||
| if (hasRegionsToMove) { | ||
| // print failed moved regions, for later process conveniently | ||
| String msg = String.format("move regions for group %s failed, failed regions: %s", | ||
| targetGroupName, failedRegions); | ||
| LOG.error(msg); | ||
| throw new DoNotRetryIOException(msg + | ||
| ", just record the last failed region's cause, more details in server log", toThrow); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -322,7 +358,7 @@ public void moveServers(Set<Address> servers, String targetGroupName) | |
| // MovedServers may be < passed in 'servers'. | ||
| Set<Address> movedServers = rsGroupInfoManager.moveServers(servers, srcGrp.getName(), | ||
| targetGroupName); | ||
| moveServerRegionsFromGroup(movedServers, targetGroupName); | ||
| moveRegionsToOrFromGroup(movedServers, targetGroupName, MoveType.FROM); | ||
| LOG.info("Move servers done: {} => {}", srcGrp.getName(), targetGroupName); | ||
| } | ||
| } | ||
|
|
@@ -364,7 +400,7 @@ public void moveTables(Set<TableName> tables, String targetGroup) throws IOExcep | |
| // targetGroup is null when a table is being deleted. In this case no further | ||
| // action is required. | ||
| if (targetGroup != null) { | ||
| moveTableRegionsToGroup(tables, targetGroup); | ||
| moveRegionsToOrFromGroup(tables, targetGroup, MoveType.TO); | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -496,9 +532,9 @@ public void moveServersAndTables(Set<Address> servers, Set<TableName> tables, St | |
| rsGroupInfoManager.moveServersAndTables(servers, tables, srcGroup, targetGroup); | ||
|
|
||
| //move regions on these servers which do not belong to group tables | ||
| moveServerRegionsFromGroup(servers, targetGroup); | ||
| moveRegionsToOrFromGroup(servers, targetGroup, MoveType.FROM); | ||
| //move regions of these tables which are not on group servers | ||
| moveTableRegionsToGroup(tables, targetGroup); | ||
| moveRegionsToOrFromGroup(tables, targetGroup, MoveType.TO); | ||
| } | ||
| LOG.info("Move servers and tables done. Severs: {}, Tables: {} => {}", servers, tables, | ||
| targetGroup); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.