/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver.wal;

import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.NavigableMap;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.log.HBaseMarkers;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALEditInternalHelper;
import org.apache.hadoop.hbase.wal.WALFactory;
import org.apache.hadoop.hbase.wal.WALKeyImpl;
import org.apache.hadoop.hbase.wal.WALProvider;
import org.apache.hadoop.hbase.wal.WALSplitter;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.ipc.RemoteException;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameter;
import org.junit.runners.Parameterized.Parameters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Throwables;

/**
 * Tests for conditions that should trigger RegionServer aborts when rolling the current WAL fails.
 */
@RunWith(Parameterized.class)
@Category({ RegionServerTests.class, MediumTests.class })
public class TestLogRollAbort {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestLogRollAbort.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestLogRollAbort.class);
  private static MiniDFSCluster dfsCluster;
  private static Admin admin;
  private static SingleProcessHBaseCluster cluster;
  protected final static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();

  /* For the split-then-roll test */
  private static final Path HBASEDIR = new Path("/hbase");
  private static final Path HBASELOGDIR = new Path("/hbaselog");
  private static final Path OLDLOGDIR = new Path(HBASELOGDIR, HConstants.HREGION_OLDLOGDIR_NAME);

  // Need to override this setup so we can edit the config before it gets sent
  // to the HDFS & HBase cluster startup.
  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    // Tweak default timeout values down for faster recovery
    TEST_UTIL.getConfiguration().setInt("hbase.regionserver.logroll.errors.tolerated", 2);
    TEST_UTIL.getConfiguration().setInt("hbase.rpc.timeout", 10 * 1000);

    // Increase the amount of time between client retries
    TEST_UTIL.getConfiguration().setLong("hbase.client.pause", 5 * 1000);

    // lower the namenode & datanode heartbeat so the namenode
    // quickly detects datanode failures
    TEST_UTIL.getConfiguration().setInt("dfs.namenode.heartbeat.recheck-interval", 5000);
    TEST_UTIL.getConfiguration().setInt("dfs.heartbeat.interval", 1);
    // the namenode might still try to choose the recently-dead datanode
    // for a pipeline, so retry creating a new pipeline multiple times
    TEST_UTIL.getConfiguration().setInt("dfs.client.block.write.retries", 10);
    // default WAL provider; overridden per parameter run in setUp()
    TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, "asyncfs");
  }

  @Parameters(name = "{index}: walProvider={0}")
  public static List<Object[]> params() {
    return Arrays.asList(new Object[] { "filesystem" }, new Object[] { "asyncfs" });
  }

  private Configuration conf;
  private FileSystem fs;

  @Parameter
  public String walProvider;

  @Before
  public void setUp() throws Exception {
    TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, walProvider);
    TEST_UTIL.startMiniCluster(2);

    cluster = TEST_UTIL.getHBaseCluster();
    dfsCluster = TEST_UTIL.getDFSCluster();
    admin = TEST_UTIL.getAdmin();
    conf = TEST_UTIL.getConfiguration();
    fs = TEST_UTIL.getDFSCluster().getFileSystem();

    // disable region rebalancing (interferes with log watching)
    cluster.getMaster().balanceSwitch(false);
    CommonFSUtils.setRootDir(conf, HBASEDIR);
    CommonFSUtils.setWALRootDir(conf, HBASELOGDIR);
  }

  @After
  public void tearDown() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * Tests that the RegionServer aborts if we hit an error closing the WAL when there are unsynced
   * WAL edits. See HBASE-4282.
   */
  @Test
  public void testRSAbortWithUnflushedEdits() throws Exception {
    LOG.info("Starting testRSAbortWithUnflushedEdits()");

    // When the hbase:meta table can be opened, the region servers are running
    TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME).close();

    // Create the test table and open it
    TableName tableName = TableName.valueOf(this.getClass().getSimpleName());
    TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(HConstants.CATALOG_FAMILY)).build();

    admin.createTable(desc);
    Table table = TEST_UTIL.getConnection().getTable(tableName);
    try {
      HRegionServer server = TEST_UTIL.getRSForFirstRegionInTable(tableName);
      WAL log = server.getWAL(null);

      Put p = new Put(Bytes.toBytes("row2001"));
      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2001));
      table.put(p);

      log.sync();

      p = new Put(Bytes.toBytes("row2002"));
      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2002));
      table.put(p);

      dfsCluster.restartDataNodes();
      LOG.info("Restarted datanodes");

      try {
        log.rollWriter(true);
      } catch (FailedLogCloseException flce) {
        // Expected exception. We used to expect that there would be unsynced appends, but this is
        // not reliable now that sync plays a role in WAL rolling. The above puts also now call
        // sync.
      } catch (Throwable t) {
        LOG.error(HBaseMarkers.FATAL, "FAILED TEST: Got wrong exception", t);
      }
    } finally {
      table.close();
    }
  }

  /**
   * Tests the case where a RegionServer enters a GC pause and comes back online only after the
   * master has declared it dead and started splitting its WALs. Log rolling after the master has
   * begun the split should fail. See HBASE-2312.
   */
  @Test
  public void testLogRollAfterSplitStart() throws IOException {
    LOG.info("Verify that a WAL roll after the split has started will fail.");
    String logName =
      ServerName.valueOf("testLogRollAfterSplitStart", 16010, EnvironmentEdgeManager.currentTime())
        .toString();
    Path thisTestsDir = new Path(HBASELOGDIR, AbstractFSWALProvider.getWALDirectoryName(logName));
    final WALFactory wals = new WALFactory(conf, logName);

    try {
      // put some entries in a WAL
      TableName tableName = TableName.valueOf(this.getClass().getName());
      RegionInfo regionInfo = RegionInfoBuilder.newBuilder(tableName).build();
      WAL log = wals.getWAL(regionInfo);
      MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(1);

      int total = 20;
      for (int i = 0; i < total; i++) {
        WALEdit kvs = new WALEdit();
        WALEditInternalHelper.addExtendedCell(kvs,
          new KeyValue(Bytes.toBytes(i), tableName.getName(), tableName.getName()));
        NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR);
        scopes.put(Bytes.toBytes("column"), 0);
        log.appendData(regionInfo, new WALKeyImpl(regionInfo.getEncodedNameAsBytes(), tableName,
          EnvironmentEdgeManager.currentTime(), mvcc, scopes), kvs);
      }
      // Send the data to the HDFS datanodes and close the HDFS writer
      log.sync();
      closeWriter((AbstractFSWAL<?>) log);

      // Code taken from MasterFileSystem.getLogDirs(), which is called from
      // MasterFileSystem.splitLog() when handling RS shutdowns (as observed by the splitting
      // process): rename the directory so a rogue RS doesn't create more WALs.
      Path rsSplitDir = thisTestsDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
      if (!fs.rename(thisTestsDir, rsSplitDir)) {
        throw new IOException("Failed fs.rename for log split: " + thisTestsDir);
      }
      LOG.debug("Renamed WAL directory: " + rsSplitDir);

      LOG.debug("Processing the old log files.");
      WALSplitter.split(HBASELOGDIR, rsSplitDir, OLDLOGDIR, fs, conf, wals);

      LOG.debug("Trying to roll the WAL.");
      IOException error = assertThrows(IOException.class, () -> log.rollWriter());
      if (error instanceof RemoteException) {
        error = ((RemoteException) error).unwrapRemoteException();
      }
      assertTrue("unexpected error: " + Throwables.getStackTraceAsString(error),
        error instanceof FileNotFoundException
          || error.getCause() instanceof FileNotFoundException);
    } finally {
      wals.close();
      if (fs.exists(thisTestsDir)) {
        fs.delete(thisTestsDir, true);
      }
    }
  }

  // Close the current WAL writer without opening a replacement, so a later rollWriter() must
  // create a brand-new writer file under the WAL directory (which this test renames for
  // splitting).
  private <W extends WALProvider.WriterBase> void closeWriter(AbstractFSWAL<W> wal) {
    wal.waitForSafePoint();
    long oldFileLen = wal.writer.getLength();
    wal.closeWriter(wal.writer, wal.getOldPath());
    wal.logRollAndSetupWalProps(wal.getOldPath(), null, oldFileLen);
    wal.writer = null;
    wal.onWriterReplaced(null);
    wal.rollRequested.set(false);
  }
}