-
Notifications
You must be signed in to change notification settings - Fork 3.4k
HBASE-22414 Interruption of moving regions in RSGroup will cause regi… #323
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
a44b321
3471e7f
3586a3f
310bc21
27ecfc2
f9927e5
d12694c
e559550
cc9e0a0
ec50f4f
670ab0e
bdb68f5
3ed9621
9ca8b9d
963fba6
8bf26f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -204,6 +204,7 @@ private void checkServersAndTables(Set<Address> servers, Set<TableName> tables, | |
| private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroupName) | ||
| throws IOException { | ||
| boolean hasRegionsToMove; | ||
| int retry = 0; | ||
| RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); | ||
| Set<Address> allSevers = new HashSet<>(servers); | ||
| do { | ||
|
|
@@ -215,7 +216,12 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup | |
| if (!targetGrp.containsTable(region.getTable())) { | ||
| LOG.info("Moving server region {}, which do not belong to RSGroup {}", | ||
| region.getShortNameToLog(), targetGroupName); | ||
| this.master.getAssignmentManager().move(region); | ||
| try { | ||
| this.master.getAssignmentManager().move(region); | ||
| }catch (IOException ioe){ | ||
| LOG.error("Move region {} from group failed, will retry, current retry time is {}", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May get too verbose. How about log as debug on each retry, then if max number of reties has been reached, log as error?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I agree. But I think we need failed regions' names to move them to the target group servers after the failed call, because we can't call move tables or servers methods to move failed regions again.Maybe I can add a failed regions list in the ex message when max number of reties has been reached.What do you think about it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have changed this log level to DEBUG, and added a failed regions list in the ex message when max number of reties has been reached. |
||
| region.getShortNameToLog(), retry, ioe); | ||
| } | ||
| if (master.getAssignmentManager().getRegionStates(). | ||
| getRegionState(region).isFailedOpen()) { | ||
| continue; | ||
|
|
@@ -229,13 +235,15 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup | |
| iter.remove(); | ||
| } | ||
| } | ||
|
|
||
| retry++; | ||
| try { | ||
| rsGroupInfoManager.wait(1000); | ||
| } catch (InterruptedException e) { | ||
| LOG.warn("Sleep interrupted", e); | ||
| Thread.currentThread().interrupt(); | ||
| } | ||
| } while (hasRegionsToMove); | ||
| } while (hasRegionsToMove && retry <= 50); | ||
|
sunhelly marked this conversation as resolved.
Outdated
wchevreuil marked this conversation as resolved.
Outdated
|
||
| } | ||
|
|
||
| /** | ||
|
|
@@ -247,23 +255,49 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup | |
| */ | ||
| private void moveTableRegionsToGroup(Set<TableName> tables, String targetGroupName) | ||
|
wchevreuil marked this conversation as resolved.
|
||
| throws IOException { | ||
| boolean hasRegionsToMove; | ||
| int retry = 0; | ||
| RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); | ||
| for (TableName table : tables) { | ||
| if (master.getAssignmentManager().isTableDisabled(table)) { | ||
| LOG.debug("Skipping move regions because the table {} is disabled", table); | ||
| continue; | ||
| } | ||
| LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName); | ||
| for (RegionInfo region : master.getAssignmentManager().getRegionStates() | ||
| .getRegionsOfTable(table)) { | ||
| ServerName sn = | ||
| master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); | ||
| if (!targetGrp.containsServer(sn.getAddress())) { | ||
| LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName); | ||
| master.getAssignmentManager().move(region); | ||
| Set<TableName> allTables = new HashSet<>(tables); | ||
| do { | ||
| hasRegionsToMove = false; | ||
| for (Iterator<TableName> iter = allTables.iterator(); iter.hasNext(); ) { | ||
| TableName table = iter.next(); | ||
| if (master.getAssignmentManager().isTableDisabled(table)) { | ||
| LOG.debug("Skipping move regions because the table {} is disabled", table); | ||
| continue; | ||
| } | ||
| LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName); | ||
| for (RegionInfo region : master.getAssignmentManager().getRegionStates() | ||
| .getRegionsOfTable(table)) { | ||
| ServerName sn = | ||
| master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); | ||
| if (!targetGrp.containsServer(sn.getAddress())) { | ||
| LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName); | ||
| try { | ||
| master.getAssignmentManager().move(region); | ||
| }catch (IOException ioe){ | ||
| LOG.error("Move region {} to group failed, will retry, current retry time is {}", | ||
| region.getShortNameToLog(), retry, ioe); | ||
| } | ||
| hasRegionsToMove = true; | ||
| } | ||
| } | ||
|
|
||
| if (!hasRegionsToMove) { | ||
| LOG.info("Table {} has no more regions to move for RSGroup", table.getNameAsString()); | ||
| iter.remove(); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| retry++; | ||
| try { | ||
| rsGroupInfoManager.wait(1000); | ||
| } catch (InterruptedException e) { | ||
| LOG.warn("Sleep interrupted", e); | ||
| Thread.currentThread().interrupt(); | ||
| } | ||
| } while (hasRegionsToMove && retry <= 50); | ||
|
wchevreuil marked this conversation as resolved.
Outdated
|
||
| } | ||
|
|
||
| @edu.umd.cs.findbugs.annotations.SuppressWarnings( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
| */ | ||
| package org.apache.hadoop.hbase.rsgroup; | ||
|
|
||
| import static org.apache.hadoop.hbase.util.Threads.sleep; | ||
| import static org.junit.Assert.assertEquals; | ||
| import static org.junit.Assert.assertFalse; | ||
| import static org.junit.Assert.assertTrue; | ||
|
|
@@ -29,15 +30,18 @@ | |
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.concurrent.atomic.AtomicBoolean; | ||
| import org.apache.hadoop.hbase.ClusterMetrics.Option; | ||
| import org.apache.hadoop.hbase.HBaseClassTestRule; | ||
| import org.apache.hadoop.hbase.ServerName; | ||
| import org.apache.hadoop.hbase.TableName; | ||
| import org.apache.hadoop.hbase.Waiter; | ||
| import org.apache.hadoop.hbase.client.RegionInfo; | ||
| import org.apache.hadoop.hbase.constraint.ConstraintException; | ||
| import org.apache.hadoop.hbase.master.RegionState; | ||
| import org.apache.hadoop.hbase.master.assignment.RegionStateNode; | ||
| import org.apache.hadoop.hbase.net.Address; | ||
| import org.apache.hadoop.hbase.testclassification.MediumTests; | ||
| import org.apache.hadoop.hbase.testclassification.LargeTests; | ||
| import org.apache.hadoop.hbase.util.Bytes; | ||
| import org.junit.After; | ||
| import org.junit.AfterClass; | ||
|
|
@@ -52,7 +56,7 @@ | |
|
|
||
| import org.apache.hbase.thirdparty.com.google.common.collect.Sets; | ||
|
|
||
| @Category({ MediumTests.class }) | ||
| @Category({ LargeTests.class }) | ||
| public class TestRSGroupsAdmin2 extends TestRSGroupsBase { | ||
|
|
||
| @ClassRule | ||
|
|
@@ -459,4 +463,197 @@ public boolean evaluate() throws Exception { | |
| Assert.assertEquals(null, rsGroupAdmin.getRSGroupInfo(fooGroup.getName())); | ||
| } | ||
|
|
||
| @Test | ||
| public void testFailedMoveWhenMoveServer() throws Exception { | ||
| final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. addGroup already invokes _ moveServerRegionsFromGroup_ indirectly, and adds an RS to it. We could simplify this test if we just create a new group from RSGroupAdminServer.addRSGroup, pick one of the RSes from assignMap, change one of its regions state, then call later call RSGroupAdminServer.moveServers?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, good idea.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case, I have used rsGroupAdmin.addRSGroup to add a new group without adding server. Then I create a multi-region table, choose a region to change state, and record the server that the region we changed on. In the last call RSGroupAdminServer.moveServers to move the recorded server, and check if the error message contains the region name. |
||
| final byte[] familyNameBytes = Bytes.toBytes("f"); | ||
| final int tableRegionCount = 10; | ||
| // All the regions created below will be assigned to the default group. | ||
| TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); | ||
| TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { | ||
| @Override | ||
| public boolean evaluate() throws Exception { | ||
| List<String> regions = getTableRegionMap().get(tableName); | ||
| if (regions == null) { | ||
| return false; | ||
| } | ||
| return getTableRegionMap().get(tableName).size() >= tableRegionCount; | ||
| } | ||
| }); | ||
|
|
||
| // get target server to move, which should has more than one regions | ||
| // randomly set a region state to SPLITTING | ||
| Map<ServerName, List<String>> assignMap = getTableServerRegionMap().get(tableName); | ||
| String rregion = null; | ||
| ServerName toMoveServer = null; | ||
| for (ServerName server : assignMap.keySet()) { | ||
| rregion = assignMap.get(server).size() > 1 && !newGroup.containsServer(server.getAddress()) ? | ||
| assignMap.get(server).get(0) : | ||
| null; | ||
| if (rregion != null) { | ||
| toMoveServer = server; | ||
| break; | ||
| } | ||
| } | ||
| assert toMoveServer != null; | ||
| RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(). | ||
| getRegionInfo(Bytes.toBytesBinary(rregion)); | ||
| RegionStateNode rsn = | ||
| TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates() | ||
| .getRegionStateNode(ri); | ||
| rsn.setState(RegionState.State.SPLITTING); | ||
|
|
||
| // start thread to recover region state | ||
| final ServerName movedServer = toMoveServer; | ||
| final String sregion = rregion; | ||
| AtomicBoolean changed = new AtomicBoolean(false); | ||
| Thread t1 = new Thread(() -> { | ||
|
wchevreuil marked this conversation as resolved.
Outdated
|
||
| LOG.debug("thread1 start running, will recover region state"); | ||
| long current = System.currentTimeMillis(); | ||
| while (System.currentTimeMillis() - current <= 50000) { | ||
|
wchevreuil marked this conversation as resolved.
Outdated
|
||
| List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(movedServer); | ||
| LOG.debug("server region size is:{}", regions.size()); | ||
| assert regions.size() >= 1; | ||
| // when there is exactly one region left, we can determine the move operation encountered | ||
| // exception caused by the strange region state. | ||
| if (regions.size() == 1) { | ||
| assertEquals(regions.get(0).getRegionNameAsString(), sregion); | ||
| rsn.setState(RegionState.State.OPEN); | ||
| LOG.info("set region {} state OPEN", sregion); | ||
| changed.set(true); | ||
| break; | ||
| } | ||
| sleep(5000); | ||
| } | ||
| }); | ||
| t1.start(); | ||
|
|
||
| // move target server to group | ||
| Thread t2 = new Thread(() -> { | ||
| LOG.info("thread2 start running, to move regions"); | ||
| try { | ||
| rsGroupAdmin.moveServers(Sets.newHashSet(movedServer.getAddress()), newGroup.getName()); | ||
| } catch (IOException e) { | ||
| LOG.error("move server error", e); | ||
| } | ||
| }); | ||
| t2.start(); | ||
|
|
||
| t1.join(); | ||
| t2.join(); | ||
|
|
||
| TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { | ||
| @Override | ||
| public boolean evaluate() { | ||
| if (changed.get()) { | ||
| return master.getAssignmentManager().getRegionsOnServer(movedServer).size() == 0 && !rsn | ||
| .getRegionLocation().equals(movedServer); | ||
| } | ||
| return false; | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| @Test | ||
| public void testFailedMoveWhenMoveTable() throws Exception { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as to the tested methods, we may apply some code reuse, given how similarly the two test structures are?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I make a function named setARegionState to randomly choosing a region and set its state. The difference of the two UTs is at adding group and moving operation. Is this OK?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will add a new skeleton function recoverRegionStateThread at the newly attached patch. |
||
| final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); | ||
| final byte[] familyNameBytes = Bytes.toBytes("f"); | ||
| final int tableRegionCount = 5; | ||
| // All the regions created below will be assigned to the default group. | ||
| TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); | ||
| TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { | ||
| @Override | ||
| public boolean evaluate() throws Exception { | ||
| List<String> regions = getTableRegionMap().get(tableName); | ||
| if (regions == null) { | ||
| return false; | ||
| } | ||
| return getTableRegionMap().get(tableName).size() >= tableRegionCount; | ||
| } | ||
| }); | ||
|
|
||
| // randomly set a region state to SPLITTING | ||
| Map<ServerName, List<String>> assignMap = getTableServerRegionMap().get(tableName); | ||
| String rregion = null; | ||
| ServerName srcServer = null; | ||
| for (ServerName server : assignMap.keySet()) { | ||
| rregion = assignMap.get(server).size() >= 1 && !newGroup.containsServer(server.getAddress()) ? | ||
| assignMap.get(server).get(0) : | ||
| null; | ||
| if (rregion != null) { | ||
| srcServer = server; | ||
| break; | ||
| } | ||
| } | ||
| assert srcServer != null; | ||
| RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(). | ||
| getRegionInfo(Bytes.toBytesBinary(rregion)); | ||
| RegionStateNode rsn = | ||
| TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates() | ||
| .getRegionStateNode(ri); | ||
| rsn.setState(RegionState.State.SPLITTING); | ||
|
|
||
| // move table to group | ||
| Thread t2 = new Thread(() -> { | ||
| LOG.info("thread2 start running, to move regions"); | ||
| try { | ||
| rsGroupAdmin.moveTables(Sets.newHashSet(tableName), newGroup.getName()); | ||
| } catch (IOException e) { | ||
| LOG.error("move server error", e); | ||
| } | ||
| }); | ||
| t2.start(); | ||
|
|
||
| // start thread to recover region state | ||
| final ServerName ss = srcServer; | ||
| final String sregion = rregion; | ||
| AtomicBoolean changed = new AtomicBoolean(false); | ||
| Thread t1 = new Thread(() -> { | ||
|
wchevreuil marked this conversation as resolved.
Outdated
|
||
| LOG.info("thread1 start running, will recover region state"); | ||
| long current = System.currentTimeMillis(); | ||
| while (System.currentTimeMillis() - current <= 50000) { | ||
|
wchevreuil marked this conversation as resolved.
Outdated
|
||
| List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(ss); | ||
| List<RegionInfo> tableRegions = new ArrayList<>(); | ||
| for (RegionInfo regionInfo : regions) { | ||
| if (regionInfo.getTable().equals(tableName)) { | ||
| tableRegions.add(regionInfo); | ||
| } | ||
| } | ||
| LOG.debug("server table region size is:{}", tableRegions.size()); | ||
| assert tableRegions.size() >= 1; | ||
| // when there is exactly one region left, we can determine the move operation encountered | ||
| // exception caused by the strange region state. | ||
| if (tableRegions.size() == 1) { | ||
| assertEquals(tableRegions.get(0).getRegionNameAsString(), sregion); | ||
| rsn.setState(RegionState.State.OPEN); | ||
| LOG.info("set region {} state OPEN", sregion); | ||
| changed.set(true); | ||
| break; | ||
| } | ||
| sleep(5000); | ||
| } | ||
| }); | ||
| t1.start(); | ||
|
|
||
| t1.join(); | ||
| t2.join(); | ||
|
|
||
| TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { | ||
| @Override | ||
| public boolean evaluate() { | ||
| if (changed.get()) { | ||
| boolean serverHasTableRegions = false; | ||
| for (RegionInfo regionInfo : master.getAssignmentManager().getRegionsOnServer(ss)) { | ||
| if (regionInfo.getTable().equals(tableName)) { | ||
| serverHasTableRegions = true; | ||
| break; | ||
| } | ||
| } | ||
| return !serverHasTableRegions && !rsn.getRegionLocation().equals(ss); | ||
| } | ||
| return false; | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| } | ||
|
wchevreuil marked this conversation as resolved.
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this really a good idea to group "TO and FROM" into one method? It's an open question, just want to hear some reasoning from you. thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think added To/From to method name makes codes more easy to be understood.
And just like writing in the docs, TO means move tables regions, FROM means move server regions.