Skip to content

Commit 4a35e2f

Browse files
authored
HBASE-23383 [hbck2] fixHoles should queue assignment procedures for any regions its fixing (#917) (#1037)
The current process for an operator, after fixing holes in meta, is to manually disable and enable the whole table. Let's try to avoid bringing the whole table offline if we can. Have the master attempt to queue up assignment procedures for any new regions it creates. Signed-off-by: stack <stack@apache.org>
1 parent 4bf7fb8 commit 4a35e2f

2 files changed

Lines changed: 191 additions & 72 deletions

File tree

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetaFixer.java

Lines changed: 157 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -21,26 +21,23 @@
2121
import java.util.ArrayList;
2222
import java.util.Collections;
2323
import java.util.List;
24+
import java.util.Optional;
2425
import java.util.Set;
2526
import java.util.SortedSet;
2627
import java.util.TreeSet;
27-
28-
import org.apache.hadoop.conf.Configuration;
28+
import java.util.stream.Collectors;
2929
import org.apache.hadoop.hbase.HConstants;
3030
import org.apache.hadoop.hbase.MetaTableAccessor;
3131
import org.apache.hadoop.hbase.TableName;
3232
import org.apache.hadoop.hbase.client.RegionInfo;
3333
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
3434
import org.apache.hadoop.hbase.exceptions.MergeRegionException;
35-
import org.apache.hadoop.hbase.regionserver.HRegion;
35+
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
3636
import org.apache.hadoop.hbase.util.Bytes;
37-
import org.apache.hadoop.hbase.util.FSUtils;
3837
import org.apache.hadoop.hbase.util.Pair;
3938
import org.apache.yetus.audience.InterfaceAudience;
4039
import org.slf4j.Logger;
4140
import org.slf4j.LoggerFactory;
42-
43-
4441
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
4542

4643

@@ -56,6 +53,7 @@ class MetaFixer {
5653
private static final Logger LOG = LoggerFactory.getLogger(MetaFixer.class);
5754
private static final String MAX_MERGE_COUNT_KEY = "hbase.master.metafixer.max.merge.count";
5855
private static final int MAX_MERGE_COUNT_DEFAULT = 10;
56+
5957
private final MasterServices masterServices;
6058
/**
6159
* Maximum for many regions to merge at a time.
@@ -86,74 +84,133 @@ void fix() throws IOException {
8684
* If hole, it papers it over by adding a region in the filesystem and to hbase:meta.
8785
* Does not assign.
8886
*/
89-
void fixHoles(CatalogJanitor.Report report) throws IOException {
90-
List<Pair<RegionInfo, RegionInfo>> holes = report.getHoles();
87+
void fixHoles(CatalogJanitor.Report report) {
88+
final List<Pair<RegionInfo, RegionInfo>> holes = report.getHoles();
9189
if (holes.isEmpty()) {
92-
LOG.debug("No holes.");
90+
LOG.info("CatalogJanitor Report contains no holes to fix. Skipping.");
9391
return;
9492
}
95-
for (Pair<RegionInfo, RegionInfo> p: holes) {
96-
RegionInfo ri = getHoleCover(p);
97-
if (ri == null) {
98-
continue;
99-
}
100-
Configuration configuration = this.masterServices.getConfiguration();
101-
HRegion.createRegionDir(configuration, ri, FSUtils.getRootDir(configuration));
102-
// If an error here, then we'll have a region in the filesystem but not
103-
// in hbase:meta (if the below fails). Should be able to rerun the fix.
104-
// Add to hbase:meta and then update in-memory state so it knows of new
105-
// Region; addRegionToMeta adds region and adds a state column set to CLOSED.
106-
MetaTableAccessor.addRegionToMeta(this.masterServices.getConnection(), ri);
107-
this.masterServices.getAssignmentManager().getRegionStates().
108-
updateRegionState(ri, RegionState.State.CLOSED);
109-
LOG.info("Fixed hole by adding {} in CLOSED state; region NOT assigned (assign to ONLINE).",
110-
ri);
111-
}
93+
94+
LOG.info("Identified {} region holes to fix. Detailed fixup progress logged at DEBUG.",
95+
holes.size());
96+
97+
final List<RegionInfo> newRegionInfos = createRegionInfosForHoles(holes);
98+
final List<RegionInfo> newMetaEntries = createMetaEntries(masterServices, newRegionInfos);
99+
final TransitRegionStateProcedure[] assignProcedures = masterServices
100+
.getAssignmentManager()
101+
.createRoundRobinAssignProcedures(newMetaEntries);
102+
103+
masterServices.getMasterProcedureExecutor().submitProcedures(assignProcedures);
104+
LOG.info(
105+
"Scheduled {}/{} new regions for assignment.", assignProcedures.length, holes.size());
106+
}
107+
108+
/**
109+
* Create a new {@link RegionInfo} corresponding to each provided "hole" pair.
110+
*/
111+
private static List<RegionInfo> createRegionInfosForHoles(
112+
final List<Pair<RegionInfo, RegionInfo>> holes) {
113+
final List<RegionInfo> newRegionInfos = holes.stream()
114+
.map(MetaFixer::getHoleCover)
115+
.filter(Optional::isPresent)
116+
.map(Optional::get)
117+
.collect(Collectors.toList());
118+
LOG.debug("Constructed {}/{} RegionInfo descriptors corresponding to identified holes.",
119+
newRegionInfos.size(), holes.size());
120+
return newRegionInfos;
112121
}
113122

114123
/**
115-
* @return Calculated RegionInfo that covers the hole <code>hole</code>
124+
* @return Attempts to calculate a new {@link RegionInfo} that covers the region range described
125+
* in {@code hole}.
116126
*/
117-
private RegionInfo getHoleCover(Pair<RegionInfo, RegionInfo> hole) {
118-
RegionInfo holeCover = null;
119-
RegionInfo left = hole.getFirst();
120-
RegionInfo right = hole.getSecond();
127+
private static Optional<RegionInfo> getHoleCover(Pair<RegionInfo, RegionInfo> hole) {
128+
final RegionInfo left = hole.getFirst();
129+
final RegionInfo right = hole.getSecond();
130+
121131
if (left.getTable().equals(right.getTable())) {
122132
// Simple case.
123133
if (Bytes.compareTo(left.getEndKey(), right.getStartKey()) >= 0) {
124-
LOG.warn("Skipping hole fix; left-side endKey is not less than right-side startKey; " +
125-
"left=<{}>, right=<{}>", left, right);
126-
return holeCover;
127-
}
128-
holeCover = buildRegionInfo(left.getTable(), left.getEndKey(), right.getStartKey());
129-
} else {
130-
boolean leftUndefined = left.equals(RegionInfo.UNDEFINED);
131-
boolean rightUnefined = right.equals(RegionInfo.UNDEFINED);
132-
boolean last = left.isLast();
133-
boolean first = right.isFirst();
134-
if (leftUndefined && rightUnefined) {
135-
LOG.warn("Skipping hole fix; both the hole left-side and right-side RegionInfos are " +
136-
"UNDEFINED; left=<{}>, right=<{}>", left, right);
137-
return holeCover;
138-
}
139-
if (leftUndefined || last) {
140-
holeCover = buildRegionInfo(right.getTable(), HConstants.EMPTY_START_ROW,
141-
right.getStartKey());
142-
} else if (rightUnefined || first) {
143-
holeCover = buildRegionInfo(left.getTable(), left.getEndKey(), HConstants.EMPTY_END_ROW);
144-
} else {
145-
LOG.warn("Skipping hole fix; don't know what to do with left=<{}>, right=<{}>",
146-
left, right);
147-
return holeCover;
134+
LOG.warn("Skipping hole fix; left-side endKey is not less than right-side startKey;"
135+
+ " left=<{}>, right=<{}>", left, right);
136+
return Optional.empty();
148137
}
138+
return Optional.of(buildRegionInfo(left.getTable(), left.getEndKey(), right.getStartKey()));
139+
}
140+
141+
final boolean leftUndefined = left.equals(RegionInfo.UNDEFINED);
142+
final boolean rightUndefined = right.equals(RegionInfo.UNDEFINED);
143+
final boolean last = left.isLast();
144+
final boolean first = right.isFirst();
145+
if (leftUndefined && rightUndefined) {
146+
LOG.warn("Skipping hole fix; both the hole left-side and right-side RegionInfos are " +
147+
"UNDEFINED; left=<{}>, right=<{}>", left, right);
148+
return Optional.empty();
149+
}
150+
if (leftUndefined || last) {
151+
return Optional.of(
152+
buildRegionInfo(right.getTable(), HConstants.EMPTY_START_ROW, right.getStartKey()));
149153
}
150-
return holeCover;
154+
if (rightUndefined || first) {
155+
return Optional.of(
156+
buildRegionInfo(left.getTable(), left.getEndKey(), HConstants.EMPTY_END_ROW));
157+
}
158+
LOG.warn("Skipping hole fix; don't know what to do with left=<{}>, right=<{}>", left, right);
159+
return Optional.empty();
151160
}
152161

153-
private RegionInfo buildRegionInfo(TableName tn, byte [] start, byte [] end) {
162+
private static RegionInfo buildRegionInfo(TableName tn, byte [] start, byte [] end) {
154163
return RegionInfoBuilder.newBuilder(tn).setStartKey(start).setEndKey(end).build();
155164
}
156165

166+
/**
167+
* Create entries in the {@code hbase:meta} for each provided {@link RegionInfo}. Best effort.
168+
* @param masterServices used to connect to {@code hbase:meta}
169+
* @param newRegionInfos the new {@link RegionInfo} entries to add to the filesystem
170+
* @return a list of {@link RegionInfo} entries for which {@code hbase:meta} entries were
171+
* successfully created
172+
*/
173+
private static List<RegionInfo> createMetaEntries(final MasterServices masterServices,
174+
final List<RegionInfo> newRegionInfos) {
175+
176+
final List<Either<RegionInfo, IOException>> addMetaEntriesResults = newRegionInfos.stream()
177+
.map(regionInfo -> {
178+
try {
179+
MetaTableAccessor.addRegionToMeta(masterServices.getConnection(), regionInfo);
180+
masterServices.getAssignmentManager()
181+
.getRegionStates()
182+
.updateRegionState(regionInfo, RegionState.State.CLOSED);
183+
return Either.<RegionInfo, IOException>ofLeft(regionInfo);
184+
} catch (IOException e) {
185+
return Either.<RegionInfo, IOException>ofRight(e);
186+
}
187+
})
188+
.collect(Collectors.toList());
189+
final List<RegionInfo> createMetaEntriesSuccesses = addMetaEntriesResults.stream()
190+
.filter(Either::hasLeft)
191+
.map(Either::getLeft)
192+
.collect(Collectors.toList());
193+
final List<IOException> createMetaEntriesFailures = addMetaEntriesResults.stream()
194+
.filter(Either::hasRight)
195+
.map(Either::getRight)
196+
.collect(Collectors.toList());
197+
LOG.debug("Added {}/{} entries to hbase:meta",
198+
createMetaEntriesSuccesses.size(), newRegionInfos.size());
199+
200+
if (!createMetaEntriesFailures.isEmpty()) {
201+
LOG.warn("Failed to create entries in hbase:meta for {}/{} RegionInfo descriptors. First"
202+
+ " failure message included; full list of failures with accompanying stack traces is"
203+
+ " available at log level DEBUG. message={}", createMetaEntriesFailures.size(),
204+
addMetaEntriesResults.size(), createMetaEntriesFailures.get(0).getMessage());
205+
if (LOG.isDebugEnabled()) {
206+
createMetaEntriesFailures.forEach(
207+
ioe -> LOG.debug("Attempt to fix region hole in hbase:meta failed.", ioe));
208+
}
209+
}
210+
211+
return createMetaEntriesSuccesses;
212+
}
213+
157214
/**
158215
* Fix overlaps noted in CJ consistency report.
159216
*/
@@ -244,4 +301,47 @@ static boolean isOverlap(RegionInfo ri, Pair<RegionInfo, RegionInfo> pair) {
244301
}
245302
return ri.isOverlap(pair.getFirst()) || ri.isOverlap(pair.getSecond());
246303
}
304+
305+
/**
306+
* A union over {@link L} and {@link R}.
307+
*/
308+
private static class Either<L, R> {
309+
private final L left;
310+
private final R right;
311+
312+
public static <L, R> Either<L, R> ofLeft(L left) {
313+
return new Either<>(left, null);
314+
}
315+
316+
public static <L, R> Either<L, R> ofRight(R right) {
317+
return new Either<>(null, right);
318+
}
319+
320+
Either(L left, R right) {
321+
this.left = left;
322+
this.right = right;
323+
}
324+
325+
public boolean hasLeft() {
326+
return left != null;
327+
}
328+
329+
public L getLeft() {
330+
if (!hasLeft()) {
331+
throw new IllegalStateException("Either contains no left.");
332+
}
333+
return left;
334+
}
335+
336+
public boolean hasRight() {
337+
return right != null;
338+
}
339+
340+
public R getRight() {
341+
if (!hasRight()) {
342+
throw new IllegalStateException("Either contains no right.");
343+
}
344+
return right;
345+
}
346+
}
247347
}

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMetaFixer.java

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@
1717
*/
1818
package org.apache.hadoop.hbase.master;
1919

20+
import static org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils.isNotEmpty;
2021
import static org.junit.Assert.assertEquals;
2122
import static org.junit.Assert.assertTrue;
22-
2323
import java.io.IOException;
2424
import java.util.Collections;
2525
import java.util.List;
26-
26+
import java.util.function.BooleanSupplier;
2727
import org.apache.hadoop.hbase.HBaseClassTestRule;
2828
import org.apache.hadoop.hbase.HBaseTestingUtility;
2929
import org.apache.hadoop.hbase.HConstants;
@@ -34,7 +34,6 @@
3434
import org.apache.hadoop.hbase.testclassification.LargeTests;
3535
import org.apache.hadoop.hbase.testclassification.MasterTests;
3636
import org.apache.hadoop.hbase.util.Threads;
37-
3837
import org.junit.AfterClass;
3938
import org.junit.BeforeClass;
4039
import org.junit.ClassRule;
@@ -70,7 +69,7 @@ private void deleteRegion(MasterServices services, RegionInfo ri) throws IOExcep
7069
}
7170

7271
@Test
73-
public void testPlugsHoles() throws IOException {
72+
public void testPlugsHoles() throws Exception {
7473
TableName tn = TableName.valueOf(this.name.getMethodName());
7574
TEST_UTIL.createMultiRegionTable(tn, HConstants.CATALOG_FAMILY);
7675
List<RegionInfo> ris = MetaTableAccessor.getTableRegions(TEST_UTIL.getConnection(), tn);
@@ -96,9 +95,11 @@ public void testPlugsHoles() throws IOException {
9695
assertTrue(report.toString(), report.isEmpty());
9796
assertEquals(initialSize,
9897
services.getAssignmentManager().getRegionStates().getRegionStates().size());
99-
// Disable and reenable so the added regions get reassigned.
100-
TEST_UTIL.getAdmin().disableTable(tn);
101-
TEST_UTIL.getAdmin().enableTable(tn);
98+
99+
// wait for RITs to settle -- those are the fixed regions being assigned -- or until the
100+
// watchdog TestRule terminates the test.
101+
await(50, () -> isNotEmpty(services.getAssignmentManager().getRegionsInTransition()));
102+
102103
ris = MetaTableAccessor.getTableRegions(TEST_UTIL.getConnection(), tn);
103104
assertEquals(originalCount, ris.size());
104105
}
@@ -143,7 +144,7 @@ private static void makeOverlap(MasterServices services, RegionInfo a, RegionInf
143144
}
144145

145146
@Test
146-
public void testOverlap() throws IOException {
147+
public void testOverlap() throws Exception {
147148
TableName tn = TableName.valueOf(this.name.getMethodName());
148149
TEST_UTIL.createMultiRegionTable(tn, HConstants.CATALOG_FAMILY);
149150
List<RegionInfo> ris = MetaTableAccessor.getTableRegions(TEST_UTIL.getConnection(), tn);
@@ -163,14 +164,32 @@ public void testOverlap() throws IOException {
163164
assertEquals(1, MetaFixer.calculateMerges(10, report.getOverlaps()).size());
164165
MetaFixer fixer = new MetaFixer(services);
165166
fixer.fixOverlaps(report);
166-
while (true) {
167-
services.getCatalogJanitor().scan();
168-
report = services.getCatalogJanitor().getLastReport();
169-
if (report.isEmpty()) {
170-
break;
167+
await(10, () -> {
168+
try {
169+
services.getCatalogJanitor().scan();
170+
final CatalogJanitor.Report postReport = services.getCatalogJanitor().getLastReport();
171+
return postReport.isEmpty();
172+
} catch (Exception e) {
173+
throw new RuntimeException(e);
174+
}
175+
});
176+
}
177+
178+
/**
179+
* Await the successful return of {@code condition}, sleeping {@code sleepMillis} between
180+
* invocations.
181+
*/
182+
private static void await(final long sleepMillis, final BooleanSupplier condition)
183+
throws InterruptedException {
184+
try {
185+
while (!condition.getAsBoolean()) {
186+
Thread.sleep(sleepMillis);
187+
}
188+
} catch (RuntimeException e) {
189+
if (e.getCause() instanceof AssertionError) {
190+
throw (AssertionError) e.getCause();
171191
}
172-
Threads.sleep(10);
192+
throw e;
173193
}
174-
assertTrue(report.toString(), report.isEmpty());
175194
}
176195
}

0 commit comments

Comments
 (0)