001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver.wal;
019
020import static org.junit.Assert.assertThrows;
021import static org.junit.Assert.assertTrue;
022
023import java.io.FileNotFoundException;
024import java.io.IOException;
025import java.util.Arrays;
026import java.util.List;
027import java.util.NavigableMap;
028import java.util.TreeMap;
029import org.apache.hadoop.conf.Configuration;
030import org.apache.hadoop.fs.FileSystem;
031import org.apache.hadoop.fs.Path;
032import org.apache.hadoop.hbase.HBaseClassTestRule;
033import org.apache.hadoop.hbase.HBaseTestingUtil;
034import org.apache.hadoop.hbase.HConstants;
035import org.apache.hadoop.hbase.KeyValue;
036import org.apache.hadoop.hbase.ServerName;
037import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
038import org.apache.hadoop.hbase.TableName;
039import org.apache.hadoop.hbase.client.Admin;
040import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
041import org.apache.hadoop.hbase.client.Put;
042import org.apache.hadoop.hbase.client.RegionInfo;
043import org.apache.hadoop.hbase.client.RegionInfoBuilder;
044import org.apache.hadoop.hbase.client.Table;
045import org.apache.hadoop.hbase.client.TableDescriptor;
046import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
047import org.apache.hadoop.hbase.log.HBaseMarkers;
048import org.apache.hadoop.hbase.regionserver.HRegionServer;
049import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl;
050import org.apache.hadoop.hbase.testclassification.MediumTests;
051import org.apache.hadoop.hbase.testclassification.RegionServerTests;
052import org.apache.hadoop.hbase.util.Bytes;
053import org.apache.hadoop.hbase.util.CommonFSUtils;
054import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
055import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
056import org.apache.hadoop.hbase.wal.WAL;
057import org.apache.hadoop.hbase.wal.WALEdit;
058import org.apache.hadoop.hbase.wal.WALEditInternalHelper;
059import org.apache.hadoop.hbase.wal.WALFactory;
060import org.apache.hadoop.hbase.wal.WALKeyImpl;
061import org.apache.hadoop.hbase.wal.WALProvider;
062import org.apache.hadoop.hbase.wal.WALSplitter;
063import org.apache.hadoop.hdfs.MiniDFSCluster;
064import org.apache.hadoop.ipc.RemoteException;
065import org.junit.After;
066import org.junit.Before;
067import org.junit.BeforeClass;
068import org.junit.ClassRule;
069import org.junit.Test;
070import org.junit.experimental.categories.Category;
071import org.junit.runner.RunWith;
072import org.junit.runners.Parameterized;
073import org.junit.runners.Parameterized.Parameter;
074import org.junit.runners.Parameterized.Parameters;
075import org.slf4j.Logger;
076import org.slf4j.LoggerFactory;
077
078import org.apache.hbase.thirdparty.com.google.common.base.Throwables;
079
080/**
081 * Tests for conditions that should trigger RegionServer aborts when rolling the current WAL fails.
082 */
083@RunWith(Parameterized.class)
084@Category({ RegionServerTests.class, MediumTests.class })
085public class TestLogRollAbort {
086
087  @ClassRule
088  public static final HBaseClassTestRule CLASS_RULE =
089    HBaseClassTestRule.forClass(TestLogRollAbort.class);
090
091  private static final Logger LOG = LoggerFactory.getLogger(AbstractTestLogRolling.class);
092  private static MiniDFSCluster dfsCluster;
093  private static Admin admin;
094  private static SingleProcessHBaseCluster cluster;
095  protected final static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
096
097  /* For the split-then-roll test */
098  private static final Path HBASEDIR = new Path("/hbase");
099  private static final Path HBASELOGDIR = new Path("/hbaselog");
100  private static final Path OLDLOGDIR = new Path(HBASELOGDIR, HConstants.HREGION_OLDLOGDIR_NAME);
101
102  // Need to override this setup so we can edit the config before it gets sent
103  // to the HDFS & HBase cluster startup.
104  @BeforeClass
105  public static void setUpBeforeClass() throws Exception {
106    // Tweak default timeout values down for faster recovery
107    TEST_UTIL.getConfiguration().setInt("hbase.regionserver.logroll.errors.tolerated", 2);
108    TEST_UTIL.getConfiguration().setInt("hbase.rpc.timeout", 10 * 1000);
109
110    // Increase the amount of time between client retries
111    TEST_UTIL.getConfiguration().setLong("hbase.client.pause", 5 * 1000);
112
113    // lower the namenode & datanode heartbeat so the namenode
114    // quickly detects datanode failures
115    TEST_UTIL.getConfiguration().setInt("dfs.namenode.heartbeat.recheck-interval", 5000);
116    TEST_UTIL.getConfiguration().setInt("dfs.heartbeat.interval", 1);
117    // the namenode might still try to choose the recently-dead datanode
118    // for a pipeline, so try to a new pipeline multiple times
119    TEST_UTIL.getConfiguration().setInt("dfs.client.block.write.retries", 10);
120    TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, "asyncfs");
121  }
122
123  @Parameters(name = "{index}: walProvider={0}")
124  public static List<Object[]> params() {
125    return Arrays.asList(new Object[] { "filesystem" }, new Object[] { "asyncfs" });
126  }
127
128  private Configuration conf;
129  private FileSystem fs;
130
131  @Parameter
132  public String walProvider;
133
134  @Before
135  public void setUp() throws Exception {
136    TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, walProvider);
137    TEST_UTIL.startMiniCluster(2);
138
139    cluster = TEST_UTIL.getHBaseCluster();
140    dfsCluster = TEST_UTIL.getDFSCluster();
141    admin = TEST_UTIL.getAdmin();
142    conf = TEST_UTIL.getConfiguration();
143    fs = TEST_UTIL.getDFSCluster().getFileSystem();
144
145    // disable region rebalancing (interferes with log watching)
146    cluster.getMaster().balanceSwitch(false);
147    CommonFSUtils.setRootDir(conf, HBASEDIR);
148    CommonFSUtils.setWALRootDir(conf, HBASELOGDIR);
149  }
150
151  @After
152  public void tearDown() throws Exception {
153    TEST_UTIL.shutdownMiniCluster();
154  }
155
156  /**
157   * Tests that RegionServer aborts if we hit an error closing the WAL when there are unsynced WAL
158   * edits. See HBASE-4282.
159   */
160  @Test
161  public void testRSAbortWithUnflushedEdits() throws Exception {
162    LOG.info("Starting testRSAbortWithUnflushedEdits()");
163
164    // When the hbase:meta table can be opened, the region servers are running
165    TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME).close();
166
167    // Create the test table and open it
168    TableName tableName = TableName.valueOf(this.getClass().getSimpleName());
169    TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
170      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(HConstants.CATALOG_FAMILY)).build();
171
172    admin.createTable(desc);
173    Table table = TEST_UTIL.getConnection().getTable(tableName);
174    try {
175      HRegionServer server = TEST_UTIL.getRSForFirstRegionInTable(tableName);
176      WAL log = server.getWAL(null);
177
178      Put p = new Put(Bytes.toBytes("row2001"));
179      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2001));
180      table.put(p);
181
182      log.sync();
183
184      p = new Put(Bytes.toBytes("row2002"));
185      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2002));
186      table.put(p);
187
188      dfsCluster.restartDataNodes();
189      LOG.info("Restarted datanodes");
190
191      try {
192        log.rollWriter(true);
193      } catch (FailedLogCloseException flce) {
194        // Expected exception. We used to expect that there would be unsynced appends but this
195        // not reliable now that sync plays a roll in wall rolling. The above puts also now call
196        // sync.
197      } catch (Throwable t) {
198        LOG.error(HBaseMarkers.FATAL, "FAILED TEST: Got wrong exception", t);
199      }
200    } finally {
201      table.close();
202    }
203  }
204
205  /**
206   * Tests the case where a RegionServer enters a GC pause, comes back online after the master
207   * declared it dead and started to split. Want log rolling after a master split to fail. See
208   * HBASE-2312.
209   */
210  @Test
211  public void testLogRollAfterSplitStart() throws IOException {
212    LOG.info("Verify wal roll after split starts will fail.");
213    String logName =
214      ServerName.valueOf("testLogRollAfterSplitStart", 16010, EnvironmentEdgeManager.currentTime())
215        .toString();
216    Path thisTestsDir = new Path(HBASELOGDIR, AbstractFSWALProvider.getWALDirectoryName(logName));
217    final WALFactory wals = new WALFactory(conf, logName);
218
219    try {
220      // put some entries in an WAL
221      TableName tableName = TableName.valueOf(this.getClass().getName());
222      RegionInfo regionInfo = RegionInfoBuilder.newBuilder(tableName).build();
223      WAL log = wals.getWAL(regionInfo);
224      MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(1);
225
226      int total = 20;
227      for (int i = 0; i < total; i++) {
228        WALEdit kvs = new WALEdit();
229        WALEditInternalHelper.addExtendedCell(kvs,
230          new KeyValue(Bytes.toBytes(i), tableName.getName(), tableName.getName()));
231        NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR);
232        scopes.put(Bytes.toBytes("column"), 0);
233        log.appendData(regionInfo, new WALKeyImpl(regionInfo.getEncodedNameAsBytes(), tableName,
234          EnvironmentEdgeManager.currentTime(), mvcc, scopes), kvs);
235      }
236      // Send the data to HDFS datanodes and close the HDFS writer
237      log.sync();
238      closeWriter((AbstractFSWAL<?>) log);
239
240      // code taken from MasterFileSystem.getLogDirs(), which is called from
241      // MasterFileSystem.splitLog() handles RS shutdowns (as observed by the splitting process)
242      // rename the directory so a rogue RS doesn't create more WALs
243      Path rsSplitDir = thisTestsDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
244      if (!fs.rename(thisTestsDir, rsSplitDir)) {
245        throw new IOException("Failed fs.rename for log split: " + thisTestsDir);
246      }
247      LOG.debug("Renamed region directory: " + rsSplitDir);
248
249      LOG.debug("Processing the old log files.");
250      WALSplitter.split(HBASELOGDIR, rsSplitDir, OLDLOGDIR, fs, conf, wals);
251
252      LOG.debug("Trying to roll the WAL.");
253      IOException error = assertThrows(IOException.class, () -> log.rollWriter());
254      if (error instanceof RemoteException) {
255        error = ((RemoteException) error).unwrapRemoteException();
256      }
257      assertTrue("unexpected error: " + Throwables.getStackTraceAsString(error),
258        error instanceof FileNotFoundException
259          || error.getCause() instanceof FileNotFoundException);
260    } finally {
261      wals.close();
262      if (fs.exists(thisTestsDir)) {
263        fs.delete(thisTestsDir, true);
264      }
265    }
266  }
267
268  private <W extends WALProvider.WriterBase> void closeWriter(AbstractFSWAL<W> wal) {
269    wal.waitForSafePoint();
270    long oldFileLen = wal.writer.getLength();
271    wal.closeWriter(wal.writer, wal.getOldPath());
272    wal.logRollAndSetupWalProps(wal.getOldPath(), null, oldFileLen);
273    wal.writer = null;
274    wal.onWriterReplaced(null);
275    wal.rollRequested.set(false);
276  }
277}