001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.client;
019
020import static org.apache.hadoop.hbase.client.MetricsConnection.CLIENT_SIDE_METRICS_ENABLED_KEY;
021
022import java.io.IOException;
023import java.net.SocketTimeoutException;
024import java.util.ArrayList;
025import java.util.List;
026import org.apache.hadoop.conf.Configuration;
027import org.apache.hadoop.hbase.HBaseClassTestRule;
028import org.apache.hadoop.hbase.HBaseTestingUtility;
029import org.apache.hadoop.hbase.HConstants;
030import org.apache.hadoop.hbase.MiniHBaseCluster;
031import org.apache.hadoop.hbase.NotServingRegionException;
032import org.apache.hadoop.hbase.StartMiniClusterOption;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.ipc.CallTimeoutException;
035import org.apache.hadoop.hbase.regionserver.HRegionServer;
036import org.apache.hadoop.hbase.regionserver.RSRpcServices;
037import org.apache.hadoop.hbase.testclassification.ClientTests;
038import org.apache.hadoop.hbase.testclassification.MediumTests;
039import org.apache.hadoop.hbase.util.Bytes;
040import org.junit.AfterClass;
041import org.junit.Assert;
042import org.junit.Before;
043import org.junit.BeforeClass;
044import org.junit.ClassRule;
045import org.junit.Test;
046import org.junit.experimental.categories.Category;
047import org.junit.function.ThrowingRunnable;
048
049import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
050import org.apache.hbase.thirdparty.com.google.protobuf.RpcController;
051import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
052
053import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos;
054
055/**
056 * These tests verify that the RPC timeouts ('hbase.client.operation.timeout' and
057 * 'hbase.client.scanner.timeout.period') work correctly using a modified Region Server which
058 * injects delays to get, scan and mutate operations.
059 * <p/>
060 * When 'hbase.client.operation.timeout' is set and client operation is not completed in time the
061 * client will retry the operation 'hbase.client.retries.number' times. After that
062 * {@link SocketTimeoutException} will be thrown.
063 * <p/>
064 * Using 'hbase.client.scanner.timeout.period' configuration property similar behavior can be
065 * specified for scan related operations such as openScanner(), next(). If that times out
066 * {@link RetriesExhaustedException} will be thrown.
067 */
068@Category({ ClientTests.class, MediumTests.class })
069public class TestClientOperationTimeout {
070
071  @ClassRule
072  public static final HBaseClassTestRule CLASS_RULE =
073    HBaseClassTestRule.forClass(TestClientOperationTimeout.class);
074
075  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
076
077  // Activate the delays after table creation to test get/scan/put
078  private static int DELAY_GET;
079  private static int DELAY_SCAN;
080  private static int DELAY_MUTATE;
081  private static int DELAY_BATCH;
082  private static int DELAY_META_SCAN;
083
084  private static boolean FAIL_BATCH = false;
085
086  private static final TableName TABLE_NAME = TableName.valueOf("Timeout");
087  private static final byte[] FAMILY = Bytes.toBytes("family");
088  private static final byte[] ROW = Bytes.toBytes("row");
089  private static final byte[] QUALIFIER = Bytes.toBytes("qualifier");
090  private static final byte[] VALUE = Bytes.toBytes("value");
091
092  private static Connection CONN;
093  private static Table TABLE;
094
095  @BeforeClass
096  public static void setUpClass() throws Exception {
097    // Set RegionServer class and use default values for other options.
098    StartMiniClusterOption option =
099      StartMiniClusterOption.builder().rsClass(DelayedRegionServer.class).build();
100    UTIL.startMiniCluster(option);
101    UTIL.getAdmin().createTable(TableDescriptorBuilder.newBuilder(TABLE_NAME)
102      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY)).build());
103
104    Configuration conf = new Configuration(UTIL.getConfiguration());
105    conf.setLong(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 500);
106    conf.setLong(HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT, 500);
107    conf.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 500);
108    conf.setLong(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);
109    CONN = ConnectionFactory.createConnection(conf);
110    TABLE = CONN.getTable(TABLE_NAME);
111  }
112
113  @Before
114  public void setUp() throws Exception {
115    DELAY_GET = 0;
116    DELAY_SCAN = 0;
117    DELAY_MUTATE = 0;
118    DELAY_BATCH = 0;
119    DELAY_META_SCAN = 0;
120    FAIL_BATCH = false;
121  }
122
123  @AfterClass
124  public static void tearDown() throws Exception {
125    Closeables.close(TABLE, true);
126    Closeables.close(CONN, true);
127    UTIL.shutdownMiniCluster();
128  }
129
130  /**
131   * Tests that a get on a table throws {@link RetriesExhaustedException} when the operation takes
132   * longer than 'hbase.client.operation.timeout'.
133   */
134  @Test
135  public void testGetTimeout() {
136    DELAY_GET = 600;
137    try {
138      TABLE.get(new Get(ROW));
139      Assert.fail("should not reach here");
140    } catch (Exception e) {
141      Assert.assertTrue(
142        e instanceof SocketTimeoutException && e.getCause() instanceof CallTimeoutException);
143    }
144  }
145
146  /**
147   * Tests that a put on a table throws {@link RetriesExhaustedException} when the operation takes
148   * longer than 'hbase.client.operation.timeout'.
149   */
150  @Test
151  public void testPutTimeout() {
152    DELAY_MUTATE = 600;
153    Put put = new Put(ROW);
154    put.addColumn(FAMILY, QUALIFIER, VALUE);
155    try {
156      TABLE.put(put);
157      Assert.fail("should not reach here");
158    } catch (Exception e) {
159      Assert.assertTrue(
160        e instanceof SocketTimeoutException && e.getCause() instanceof CallTimeoutException);
161    }
162  }
163
164  /**
165   * Tests that a batch mutate and batch get on a table throws {@link SocketTimeoutException} or
166   * {@link OperationTimeoutExceededException} when the operation takes longer than
167   * 'hbase.client.operation.timeout'.
168   */
169  @Test
170  public void testMultiTimeout() {
171    DELAY_BATCH = 600;
172    Put put1 = new Put(ROW);
173    put1.addColumn(FAMILY, QUALIFIER, VALUE);
174    Put put2 = new Put(ROW);
175    put2.addColumn(FAMILY, QUALIFIER, VALUE);
176    List<Put> puts = new ArrayList<>();
177    puts.add(put1);
178    puts.add(put2);
179    assertMultiException(() -> TABLE.batch(puts, new Object[2]));
180
181    Get get1 = new Get(ROW);
182    get1.addColumn(FAMILY, QUALIFIER);
183    Get get2 = new Get(ROW);
184    get2.addColumn(FAMILY, QUALIFIER);
185
186    List<Get> gets = new ArrayList<>();
187    gets.add(get1);
188    gets.add(get2);
189    assertMultiException(() -> TABLE.batch(gets, new Object[2]));
190  }
191
192  /**
193   * AsyncProcess has an overall waitUntilDone with a timeout, and if all callables dont finish by
194   * then it throws a SocketTimeoutException. The callables themselves also try to honor the
195   * operation timeout and result in OperationTimeoutExceededException (wrapped in
196   * RetriesExhausted). The latter is the more user-friendly exception because it contains details
197   * about which server has issues, etc. For now we need to account for both because it's sort of a
198   * race to see which timeout exceeds first. Maybe we can replace the waitUntilDone behavior with
199   * an interrupt in the future so we can further unify.
200   */
201  private void assertMultiException(ThrowingRunnable runnable) {
202    IOException e = Assert.assertThrows(IOException.class, runnable);
203    if (e instanceof SocketTimeoutException) {
204      return;
205    }
206    Assert.assertTrue("Expected SocketTimeoutException or RetriesExhaustedWithDetailsException"
207      + " but was " + e.getClass(), e instanceof RetriesExhaustedWithDetailsException);
208    for (Throwable cause : ((RetriesExhaustedWithDetailsException) e).getCauses()) {
209      Assert.assertEquals(OperationTimeoutExceededException.class, cause.getClass());
210    }
211  }
212
213  /**
214   * Tests that a batch get on a table throws
215   * {@link org.apache.hadoop.hbase.client.OperationTimeoutExceededException} when the region lookup
216   * takes longer than the 'hbase.client.operation.timeout'. This specifically tests that when meta
217   * is slow, the fetching of region locations for a batch is not allowed to itself exceed the
218   * operation timeout. In a batch size of 100, it's possible to need to make 100 meta calls in
219   * sequence. If meta is slow, we should abort the request once the operation timeout is exceeded,
220   * even if we haven't finished locating all regions. See HBASE-27490
221   */
222  @Test
223  public void testMultiGetMetaTimeout() throws IOException {
224    Configuration conf = new Configuration(UTIL.getConfiguration());
225
226    conf.setLong(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 400);
227    conf.setBoolean(CLIENT_SIDE_METRICS_ENABLED_KEY, true);
228    try (Connection specialConnection = ConnectionFactory.createConnection(conf);
229      Table specialTable = specialConnection.getTable(TABLE_NAME)) {
230
231      MetricsConnection metrics =
232        ((ConnectionImplementation) specialConnection).getConnectionMetrics();
233      long metaCacheNumClearServerPreFailure = metrics.getMetaCacheNumClearServer().getCount();
234
235      // delay and timeout are the same, so we should see a timeout after the first region lookup
236      DELAY_META_SCAN = 400;
237
238      List<Get> gets = new ArrayList<>();
239      // we need to ensure the region look-ups eat up more time than the operation timeout without
240      // exceeding the scan timeout.
241      for (int i = 0; i < 100; i++) {
242        gets.add(new Get(Bytes.toBytes(i)).addColumn(FAMILY, QUALIFIER));
243      }
244      try {
245        specialTable.get(gets);
246        Assert.fail("should not reach here");
247      } catch (Exception e) {
248        RetriesExhaustedWithDetailsException expected = (RetriesExhaustedWithDetailsException) e;
249        Assert.assertEquals(100, expected.getNumExceptions());
250
251        // verify we do not clear the cache in this situation otherwise we will create pathological
252        // feedback loop with multigets See: HBASE-27487
253        long metaCacheNumClearServerPostFailure = metrics.getMetaCacheNumClearServer().getCount();
254        Assert.assertEquals(metaCacheNumClearServerPreFailure, metaCacheNumClearServerPostFailure);
255
256        for (Throwable cause : expected.getCauses()) {
257          Assert.assertTrue(cause instanceof OperationTimeoutExceededException);
258          // Check that this is the timeout thrown by AsyncRequestFutureImpl during region lookup
259          Assert.assertTrue(cause.getMessage().contains("Operation timeout exceeded during"));
260        }
261      }
262    }
263  }
264
265  /**
266   * Tests that a batch get on a table throws
267   * {@link org.apache.hadoop.hbase.client.OperationTimeoutExceededException} when retries are tuned
268   * too high to be able to be processed within the operation timeout. In this case, the final
269   * OperationTimeoutExceededException should not trigger a cache clear (but the individual failures
270   * may, if appropriate). This test skirts around the timeout checks during meta lookups from
271   * HBASE-27490, because we want to test for the case where meta lookups were able to succeed in
272   * time but did not leave enough time for the actual calls to occur. See HBASE-27487
273   */
274  @Test
275  public void testMultiGetRetryTimeout() {
276    Configuration conf = new Configuration(UTIL.getConfiguration());
277
278    // allow 1 retry, and 0 backoff
279    conf.setLong(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 500);
280    conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);
281    conf.setLong(HConstants.HBASE_CLIENT_PAUSE, 0);
282    conf.setBoolean(CLIENT_SIDE_METRICS_ENABLED_KEY, true);
283
284    try (Connection specialConnection = ConnectionFactory.createConnection(conf);
285      Table specialTable = specialConnection.getTable(TABLE_NAME)) {
286
287      MetricsConnection metrics =
288        ((ConnectionImplementation) specialConnection).getConnectionMetrics();
289      long metaCacheNumClearServerPreFailure = metrics.getMetaCacheNumClearRegion().getCount();
290
291      // meta scan should take up most of the timeout but not all
292      DELAY_META_SCAN = 300;
293      // fail the batch call, causing a retry
294      FAIL_BATCH = true;
295
296      // Use a batch size of 1 so that we only make 1 meta call per attempt
297      List<Get> gets = new ArrayList<>();
298      gets.add(new Get(Bytes.toBytes(0)).addColumn(FAMILY, QUALIFIER));
299
300      try {
301        specialTable.batch(gets, new Object[1]);
302        Assert.fail("should not reach here");
303      } catch (Exception e) {
304        RetriesExhaustedWithDetailsException expected = (RetriesExhaustedWithDetailsException) e;
305        Assert.assertEquals(1, expected.getNumExceptions());
306
307        // We expect that the error caused by FAIL_BATCH would clear the meta cache but
308        // the OperationTimeoutExceededException should not. So only allow new cache clear here
309        long metaCacheNumClearServerPostFailure = metrics.getMetaCacheNumClearRegion().getCount();
310        Assert.assertEquals(metaCacheNumClearServerPreFailure + 1,
311          metaCacheNumClearServerPostFailure);
312
313        for (Throwable cause : expected.getCauses()) {
314          Assert.assertTrue(cause instanceof OperationTimeoutExceededException);
315          // Check that this is the timeout thrown by CancellableRegionServerCallable
316          Assert.assertTrue(cause.getMessage().contains("Timeout exceeded before call began"));
317        }
318      }
319    } catch (IOException e) {
320      throw new RuntimeException(e);
321    }
322  }
323
324  /**
325   * Tests that scan on a table throws {@link RetriesExhaustedException} when the operation takes
326   * longer than 'hbase.client.scanner.timeout.period'.
327   */
328  @Test
329  public void testScanTimeout() {
330    DELAY_SCAN = 600;
331    try {
332      ResultScanner scanner = TABLE.getScanner(new Scan());
333      scanner.next();
334      Assert.fail("should not reach here");
335    } catch (Exception e) {
336      Assert.assertTrue(
337        e instanceof RetriesExhaustedException && e.getCause() instanceof SocketTimeoutException);
338    }
339  }
340
341  private static class DelayedRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer {
342    public DelayedRegionServer(Configuration conf) throws IOException, InterruptedException {
343      super(conf);
344    }
345
346    @Override
347    protected RSRpcServices createRpcServices() throws IOException {
348      return new DelayedRSRpcServices(this);
349    }
350  }
351
352  /**
353   * This {@link RSRpcServices} class injects delay for Rpc calls and after executes super methods.
354   */
355  public static class DelayedRSRpcServices extends RSRpcServices {
356    DelayedRSRpcServices(HRegionServer rs) throws IOException {
357      super(rs);
358    }
359
360    @Override
361    public ClientProtos.GetResponse get(RpcController controller, ClientProtos.GetRequest request)
362      throws ServiceException {
363      try {
364        Thread.sleep(DELAY_GET);
365      } catch (InterruptedException e) {
366        LOG.error("Sleep interrupted during get operation", e);
367      }
368      return super.get(controller, request);
369    }
370
371    @Override
372    public ClientProtos.MutateResponse mutate(RpcController rpcc,
373      ClientProtos.MutateRequest request) throws ServiceException {
374      try {
375        Thread.sleep(DELAY_MUTATE);
376      } catch (InterruptedException e) {
377        LOG.error("Sleep interrupted during mutate operation", e);
378      }
379      return super.mutate(rpcc, request);
380    }
381
382    @Override
383    public ClientProtos.ScanResponse scan(RpcController controller,
384      ClientProtos.ScanRequest request) throws ServiceException {
385      try {
386        String regionName = Bytes.toString(request.getRegion().getValue().toByteArray());
387        if (regionName.contains(TableName.META_TABLE_NAME.getNameAsString())) {
388          Thread.sleep(DELAY_META_SCAN);
389        } else {
390          Thread.sleep(DELAY_SCAN);
391        }
392      } catch (InterruptedException e) {
393        LOG.error("Sleep interrupted during scan operation", e);
394      }
395      return super.scan(controller, request);
396    }
397
398    @Override
399    public ClientProtos.MultiResponse multi(RpcController rpcc, ClientProtos.MultiRequest request)
400      throws ServiceException {
401      try {
402        if (FAIL_BATCH) {
403          throw new ServiceException(new NotServingRegionException("simulated failure"));
404        }
405        Thread.sleep(DELAY_BATCH);
406      } catch (InterruptedException e) {
407        LOG.error("Sleep interrupted during multi operation", e);
408      }
409      return super.multi(rpcc, request);
410    }
411  }
412}