Skip to content

Commit a7f776f

Browse files
committed
HBASE-27579 CatalogJanitor can cause data loss due to errors during cleanMergeRegion (apache#4986)
Signed-off-by: Andrew Purtell <apurtell@apache.org> Signed-off-by: Viraj Jasani <vjasani@apache.org> Signed-off-by: Duo Zhang <zhangduo@apache.org>
1 parent 67a30f3 commit a7f776f

2 files changed

Lines changed: 147 additions & 56 deletions

File tree

hbase-server/src/main/java/org/apache/hadoop/hbase/master/janitor/CatalogJanitor.java

Lines changed: 45 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
import org.apache.hadoop.hbase.MetaTableAccessor;
3535
import org.apache.hadoop.hbase.ScheduledChore;
3636
import org.apache.hadoop.hbase.TableName;
37-
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
3837
import org.apache.hadoop.hbase.client.Connection;
3938
import org.apache.hadoop.hbase.client.ConnectionFactory;
4039
import org.apache.hadoop.hbase.client.Get;
@@ -184,13 +183,13 @@ public int scan() throws IOException {
184183
if (this.services.isInMaintenanceMode()) {
185184
// Stop cleaning if the master is in maintenance mode
186185
if (LOG.isDebugEnabled()) {
187-
LOG.debug("In maintenence mode, not cleaning");
186+
LOG.debug("In mainteneace mode, not cleaning");
188187
}
189188
break;
190189
}
191190

192191
List<RegionInfo> parents = MetaTableAccessor.getMergeRegions(e.getValue().rawCells());
193-
if (parents != null && cleanMergeRegion(e.getKey(), parents)) {
192+
if (parents != null && cleanMergeRegion(this.services, e.getKey(), parents)) {
194193
gcs++;
195194
}
196195
}
@@ -203,7 +202,7 @@ public int scan() throws IOException {
203202
if (this.services.isInMaintenanceMode()) {
204203
// Stop cleaning if the master is in maintenance mode
205204
if (LOG.isDebugEnabled()) {
206-
LOG.debug("In maintenence mode, not cleaning");
205+
LOG.debug("In maintenance mode, not cleaning");
207206
}
208207
break;
209208
}
@@ -250,30 +249,24 @@ public CatalogJanitorReport getLastReport() {
250249
* @return true if we delete references in merged region on hbase:meta and archive the files on
251250
* the file system
252251
*/
253-
private boolean cleanMergeRegion(final RegionInfo mergedRegion, List<RegionInfo> parents)
254-
throws IOException {
252+
static boolean cleanMergeRegion(MasterServices services, final RegionInfo mergedRegion,
253+
List<RegionInfo> parents) throws IOException {
255254
if (LOG.isDebugEnabled()) {
256255
LOG.debug("Cleaning merged region {}", mergedRegion);
257256
}
258-
FileSystem fs = this.services.getMasterFileSystem().getFileSystem();
259-
Path rootdir = this.services.getMasterFileSystem().getRootDir();
260-
Path tabledir = CommonFSUtils.getTableDir(rootdir, mergedRegion.getTable());
261-
TableDescriptor htd = getDescriptor(mergedRegion.getTable());
262-
HRegionFileSystem regionFs = null;
263-
try {
264-
regionFs = HRegionFileSystem.openRegionFromFileSystem(this.services.getConfiguration(), fs,
265-
tabledir, mergedRegion, true);
266-
} catch (IOException e) {
267-
LOG.warn("Merged region does not exist: " + mergedRegion.getEncodedName());
268-
}
269-
if (regionFs == null || !regionFs.hasReferences(htd)) {
257+
258+
Pair<Boolean, Boolean> result =
259+
checkRegionReferences(services, mergedRegion.getTable(), mergedRegion);
260+
261+
if (hasNoReferences(result)) {
270262
if (LOG.isDebugEnabled()) {
271263
LOG.debug(
272264
"Deleting parents ({}) from fs; merged child {} no longer holds references", parents
273265
.stream().map(r -> RegionInfo.getShortNameToLog(r)).collect(Collectors.joining(", ")),
274266
mergedRegion);
275267
}
276-
ProcedureExecutor<MasterProcedureEnv> pe = this.services.getMasterProcedureExecutor();
268+
269+
ProcedureExecutor<MasterProcedureEnv> pe = services.getMasterProcedureExecutor();
277270
GCMultipleMergedRegionsProcedure mergeRegionProcedure =
278271
new GCMultipleMergedRegionsProcedure(pe.getEnvironment(), mergedRegion, parents);
279272
pe.submitProcedure(mergeRegionProcedure);
@@ -282,7 +275,15 @@ private boolean cleanMergeRegion(final RegionInfo mergedRegion, List<RegionInfo>
282275
mergedRegion);
283276
}
284277
return true;
278+
} else {
279+
if (LOG.isDebugEnabled()) {
280+
LOG.debug(
281+
"Deferring cleanup up of {} parents of merged region {}, because references "
282+
+ "still exist in merged region or we encountered an exception in checking",
283+
parents.size(), mergedRegion.getEncodedName());
284+
}
285285
}
286+
286287
return false;
287288
}
288289

@@ -334,8 +335,10 @@ static boolean cleanParent(MasterServices services, RegionInfo parent, Result ro
334335
}
335336
// Run checks on each daughter split.
336337
PairOfSameType<RegionInfo> daughters = MetaTableAccessor.getDaughterRegions(rowContent);
337-
Pair<Boolean, Boolean> a = checkDaughterInFs(services, parent, daughters.getFirst());
338-
Pair<Boolean, Boolean> b = checkDaughterInFs(services, parent, daughters.getSecond());
338+
Pair<Boolean, Boolean> a =
339+
checkRegionReferences(services, parent.getTable(), daughters.getFirst());
340+
Pair<Boolean, Boolean> b =
341+
checkRegionReferences(services, parent.getTable(), daughters.getSecond());
339342
if (hasNoReferences(a) && hasNoReferences(b)) {
340343
String daughterA =
341344
daughters.getFirst() != null ? daughters.getFirst().getShortNameToLog() : "null";
@@ -388,59 +391,45 @@ private static boolean hasNoReferences(final Pair<Boolean, Boolean> p) {
388391
}
389392

390393
/**
391-
* Checks if a daughter region -- either splitA or splitB -- still holds references to parent.
392-
* @param parent Parent region
393-
* @param daughter Daughter region
394-
* @return A pair where the first boolean says whether or not the daughter region directory exists
395-
* in the filesystem and then the second boolean says whether the daughter has references
396-
* to the parent.
394+
* Checks if a region still holds references to parent.
395+
* @param tableName The table for the region
396+
* @param region The region to check
397+
* @return A pair where the first boolean says whether the region directory exists in the
398+
* filesystem and then the second boolean says whether the region has references to a
399+
* parent.
397400
*/
398-
private static Pair<Boolean, Boolean> checkDaughterInFs(MasterServices services,
399-
final RegionInfo parent, final RegionInfo daughter) throws IOException {
400-
if (daughter == null) {
401+
private static Pair<Boolean, Boolean> checkRegionReferences(MasterServices services,
402+
TableName tableName, RegionInfo region) throws IOException {
403+
if (region == null) {
401404
return new Pair<>(Boolean.FALSE, Boolean.FALSE);
402405
}
403406

404407
FileSystem fs = services.getMasterFileSystem().getFileSystem();
405408
Path rootdir = services.getMasterFileSystem().getRootDir();
406-
Path tabledir = CommonFSUtils.getTableDir(rootdir, daughter.getTable());
407-
408-
Path daughterRegionDir = new Path(tabledir, daughter.getEncodedName());
409-
410-
HRegionFileSystem regionFs;
409+
Path tabledir = CommonFSUtils.getTableDir(rootdir, tableName);
410+
Path regionDir = new Path(tabledir, region.getEncodedName());
411411

412412
try {
413-
if (!CommonFSUtils.isExists(fs, daughterRegionDir)) {
413+
if (!CommonFSUtils.isExists(fs, regionDir)) {
414414
return new Pair<>(Boolean.FALSE, Boolean.FALSE);
415415
}
416416
} catch (IOException ioe) {
417-
LOG.error("Error trying to determine if daughter region exists, "
418-
+ "assuming exists and has references", ioe);
417+
LOG.error("Error trying to determine if region exists, assuming exists and has references",
418+
ioe);
419419
return new Pair<>(Boolean.TRUE, Boolean.TRUE);
420420
}
421421

422-
boolean references = false;
423-
TableDescriptor parentDescriptor = services.getTableDescriptors().get(parent.getTable());
422+
TableDescriptor tableDescriptor = services.getTableDescriptors().get(tableName);
424423
try {
425-
regionFs = HRegionFileSystem.openRegionFromFileSystem(services.getConfiguration(), fs,
426-
tabledir, daughter, true);
427-
428-
for (ColumnFamilyDescriptor family : parentDescriptor.getColumnFamilies()) {
429-
references = regionFs.hasReferences(family.getNameAsString());
430-
if (references) {
431-
break;
432-
}
433-
}
424+
HRegionFileSystem regionFs = HRegionFileSystem
425+
.openRegionFromFileSystem(services.getConfiguration(), fs, tabledir, region, true);
426+
boolean references = regionFs.hasReferences(tableDescriptor);
427+
return new Pair<>(Boolean.TRUE, references);
434428
} catch (IOException e) {
435-
LOG.error("Error trying to determine referenced files from : " + daughter.getEncodedName()
436-
+ ", to: " + parent.getEncodedName() + " assuming has references", e);
429+
LOG.error("Error trying to determine if region {} has references, assuming it does",
430+
region.getEncodedName(), e);
437431
return new Pair<>(Boolean.TRUE, Boolean.TRUE);
438432
}
439-
return new Pair<>(Boolean.TRUE, references);
440-
}
441-
442-
private TableDescriptor getDescriptor(final TableName tableName) throws IOException {
443-
return this.services.getTableDescriptors().get(tableName);
444433
}
445434

446435
private void updateAssignmentManagerMetrics() {

hbase-server/src/test/java/org/apache/hadoop/hbase/master/janitor/TestCatalogJanitor.java

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,12 @@
2121
import static org.junit.Assert.assertEquals;
2222
import static org.junit.Assert.assertFalse;
2323
import static org.junit.Assert.assertTrue;
24+
import static org.mockito.Mockito.any;
25+
import static org.mockito.Mockito.doAnswer;
2426
import static org.mockito.Mockito.doReturn;
27+
import static org.mockito.Mockito.doThrow;
2528
import static org.mockito.Mockito.spy;
29+
import static org.mockito.Mockito.when;
2630

2731
import java.io.IOException;
2832
import java.util.ArrayList;
@@ -34,6 +38,7 @@
3438
import java.util.SortedSet;
3539
import java.util.TreeMap;
3640
import java.util.concurrent.ConcurrentSkipListMap;
41+
import java.util.concurrent.atomic.AtomicBoolean;
3742
import org.apache.hadoop.fs.FSDataOutputStream;
3843
import org.apache.hadoop.fs.FileStatus;
3944
import org.apache.hadoop.fs.FileSystem;
@@ -46,10 +51,12 @@
4651
import org.apache.hadoop.hbase.MetaMockingUtil;
4752
import org.apache.hadoop.hbase.ServerName;
4853
import org.apache.hadoop.hbase.TableName;
54+
import org.apache.hadoop.hbase.client.RegionInfo;
4955
import org.apache.hadoop.hbase.client.Result;
5056
import org.apache.hadoop.hbase.client.TableDescriptor;
5157
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
5258
import org.apache.hadoop.hbase.io.Reference;
59+
import org.apache.hadoop.hbase.master.MasterFileSystem;
5360
import org.apache.hadoop.hbase.master.MasterServices;
5461
import org.apache.hadoop.hbase.master.assignment.MockMasterServices;
5562
import org.apache.hadoop.hbase.master.janitor.CatalogJanitor.SplitParentFirstComparator;
@@ -61,6 +68,7 @@
6168
import org.apache.hadoop.hbase.testclassification.MediumTests;
6269
import org.apache.hadoop.hbase.util.Bytes;
6370
import org.apache.hadoop.hbase.util.CommonFSUtils;
71+
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
6472
import org.apache.hadoop.hbase.util.HFileArchiveUtil;
6573
import org.apache.zookeeper.KeeperException;
6674
import org.junit.After;
@@ -113,6 +121,100 @@ public void teardown() {
113121
this.masterServices.stop("DONE");
114122
}
115123

124+
@Test
125+
public void testCleanMerge() throws IOException {
126+
TableDescriptor td = createTableDescriptorForCurrentMethod();
127+
// Create regions.
128+
HRegionInfo merged =
129+
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("eee"));
130+
HRegionInfo parenta =
131+
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("ccc"));
132+
HRegionInfo parentb =
133+
new HRegionInfo(td.getTableName(), Bytes.toBytes("ccc"), Bytes.toBytes("eee"));
134+
135+
List<RegionInfo> parents = new ArrayList<>();
136+
parents.add(parenta);
137+
parents.add(parentb);
138+
139+
Path rootdir = this.masterServices.getMasterFileSystem().getRootDir();
140+
Path tabledir = CommonFSUtils.getTableDir(rootdir, td.getTableName());
141+
Path storedir =
142+
HRegionFileSystem.getStoreHomedir(tabledir, merged, td.getColumnFamilies()[0].getName());
143+
144+
Path parentaRef = createMergeReferenceFile(storedir, merged, parenta);
145+
Path parentbRef = createMergeReferenceFile(storedir, merged, parentb);
146+
147+
// references exist, should not clean
148+
assertFalse(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));
149+
150+
masterServices.getMasterFileSystem().getFileSystem().delete(parentaRef, false);
151+
152+
// one reference still exists, should not clean
153+
assertFalse(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));
154+
155+
masterServices.getMasterFileSystem().getFileSystem().delete(parentbRef, false);
156+
157+
// all references removed, should clean
158+
assertTrue(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));
159+
}
160+
161+
@Test
162+
public void testDontCleanMergeIfFileSystemException() throws IOException {
163+
TableDescriptor td = createTableDescriptorForCurrentMethod();
164+
// Create regions.
165+
HRegionInfo merged =
166+
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("eee"));
167+
HRegionInfo parenta =
168+
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("ccc"));
169+
HRegionInfo parentb =
170+
new HRegionInfo(td.getTableName(), Bytes.toBytes("ccc"), Bytes.toBytes("eee"));
171+
172+
List<RegionInfo> parents = new ArrayList<>();
173+
parents.add(parenta);
174+
parents.add(parentb);
175+
176+
Path rootdir = this.masterServices.getMasterFileSystem().getRootDir();
177+
Path tabledir = CommonFSUtils.getTableDir(rootdir, td.getTableName());
178+
Path storedir =
179+
HRegionFileSystem.getStoreHomedir(tabledir, merged, td.getColumnFamilies()[0].getName());
180+
createMergeReferenceFile(storedir, merged, parenta);
181+
182+
MasterServices mockedMasterServices = spy(masterServices);
183+
MasterFileSystem mockedMasterFileSystem = spy(masterServices.getMasterFileSystem());
184+
FileSystem mockedFileSystem = spy(masterServices.getMasterFileSystem().getFileSystem());
185+
186+
when(mockedMasterServices.getMasterFileSystem()).thenReturn(mockedMasterFileSystem);
187+
when(mockedMasterFileSystem.getFileSystem()).thenReturn(mockedFileSystem);
188+
189+
// throw on the first exists check
190+
doThrow(new IOException("Some exception")).when(mockedFileSystem).exists(any());
191+
192+
assertFalse(CatalogJanitor.cleanMergeRegion(mockedMasterServices, merged, parents));
193+
194+
// throw on the second exists check (within HRegionfileSystem)
195+
AtomicBoolean returned = new AtomicBoolean(false);
196+
doAnswer(invocationOnMock -> {
197+
if (!returned.get()) {
198+
returned.set(true);
199+
return masterServices.getMasterFileSystem().getFileSystem()
200+
.exists(invocationOnMock.getArgument(0));
201+
}
202+
throw new IOException("Some exception");
203+
}).when(mockedFileSystem).exists(any());
204+
205+
assertFalse(CatalogJanitor.cleanMergeRegion(mockedMasterServices, merged, parents));
206+
}
207+
208+
private Path createMergeReferenceFile(Path storeDir, HRegionInfo mergedRegion,
209+
HRegionInfo parentRegion) throws IOException {
210+
Reference ref = Reference.createTopReference(mergedRegion.getStartKey());
211+
long now = EnvironmentEdgeManager.currentTime();
212+
// Reference name has this format: StoreFile#REF_NAME_PARSER
213+
Path p = new Path(storeDir, Long.toString(now) + "." + parentRegion.getEncodedName());
214+
FileSystem fs = this.masterServices.getMasterFileSystem().getFileSystem();
215+
return ref.write(fs, p);
216+
}
217+
116218
/**
117219
* Test clearing a split parent.
118220
*/

0 commit comments

Comments
 (0)