001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.client; 019 020import static org.apache.hadoop.hbase.client.MetricsConnection.CLIENT_SIDE_METRICS_ENABLED_KEY; 021 022import java.io.IOException; 023import java.net.SocketTimeoutException; 024import java.util.ArrayList; 025import java.util.List; 026import org.apache.hadoop.conf.Configuration; 027import org.apache.hadoop.hbase.HBaseClassTestRule; 028import org.apache.hadoop.hbase.HBaseTestingUtility; 029import org.apache.hadoop.hbase.HConstants; 030import org.apache.hadoop.hbase.MiniHBaseCluster; 031import org.apache.hadoop.hbase.NotServingRegionException; 032import org.apache.hadoop.hbase.StartMiniClusterOption; 033import org.apache.hadoop.hbase.TableName; 034import org.apache.hadoop.hbase.ipc.CallTimeoutException; 035import org.apache.hadoop.hbase.regionserver.HRegionServer; 036import org.apache.hadoop.hbase.regionserver.RSRpcServices; 037import org.apache.hadoop.hbase.testclassification.ClientTests; 038import org.apache.hadoop.hbase.testclassification.MediumTests; 039import org.apache.hadoop.hbase.util.Bytes; 040import org.junit.AfterClass; 041import org.junit.Assert; 042import org.junit.Before; 043import org.junit.BeforeClass; 044import org.junit.ClassRule; 045import org.junit.Test; 046import org.junit.experimental.categories.Category; 047import org.junit.function.ThrowingRunnable; 048 049import org.apache.hbase.thirdparty.com.google.common.io.Closeables; 050import org.apache.hbase.thirdparty.com.google.protobuf.RpcController; 051import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException; 052 053import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos; 054 055/** 056 * These tests verify that the RPC timeouts ('hbase.client.operation.timeout' and 057 * 'hbase.client.scanner.timeout.period') work correctly using a modified Region Server which 058 * injects delays to get, scan and mutate operations. 059 * <p/> 060 * When 'hbase.client.operation.timeout' is set and client operation is not completed in time the 061 * client will retry the operation 'hbase.client.retries.number' times. After that 062 * {@link SocketTimeoutException} will be thrown. 063 * <p/> 064 * Using 'hbase.client.scanner.timeout.period' configuration property similar behavior can be 065 * specified for scan related operations such as openScanner(), next(). If that times out 066 * {@link RetriesExhaustedException} will be thrown. 067 */ 068@Category({ ClientTests.class, MediumTests.class }) 069public class TestClientOperationTimeout { 070 071 @ClassRule 072 public static final HBaseClassTestRule CLASS_RULE = 073 HBaseClassTestRule.forClass(TestClientOperationTimeout.class); 074 075 private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); 076 077 // Activate the delays after table creation to test get/scan/put 078 private static int DELAY_GET; 079 private static int DELAY_SCAN; 080 private static int DELAY_MUTATE; 081 private static int DELAY_BATCH; 082 private static int DELAY_META_SCAN; 083 084 private static boolean FAIL_BATCH = false; 085 086 private static final TableName TABLE_NAME = TableName.valueOf("Timeout"); 087 private static final byte[] FAMILY = Bytes.toBytes("family"); 088 private static final byte[] ROW = Bytes.toBytes("row"); 089 private static final byte[] QUALIFIER = Bytes.toBytes("qualifier"); 090 private static final byte[] VALUE = Bytes.toBytes("value"); 091 092 private static Connection CONN; 093 private static Table TABLE; 094 095 @BeforeClass 096 public static void setUpClass() throws Exception { 097 // Set RegionServer class and use default values for other options. 098 StartMiniClusterOption option = 099 StartMiniClusterOption.builder().rsClass(DelayedRegionServer.class).build(); 100 UTIL.startMiniCluster(option); 101 UTIL.getAdmin().createTable(TableDescriptorBuilder.newBuilder(TABLE_NAME) 102 .setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY)).build()); 103 104 Configuration conf = new Configuration(UTIL.getConfiguration()); 105 conf.setLong(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 500); 106 conf.setLong(HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT, 500); 107 conf.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 500); 108 conf.setLong(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1); 109 CONN = ConnectionFactory.createConnection(conf); 110 TABLE = CONN.getTable(TABLE_NAME); 111 } 112 113 @Before 114 public void setUp() throws Exception { 115 DELAY_GET = 0; 116 DELAY_SCAN = 0; 117 DELAY_MUTATE = 0; 118 DELAY_BATCH = 0; 119 DELAY_META_SCAN = 0; 120 FAIL_BATCH = false; 121 } 122 123 @AfterClass 124 public static void tearDown() throws Exception { 125 Closeables.close(TABLE, true); 126 Closeables.close(CONN, true); 127 UTIL.shutdownMiniCluster(); 128 } 129 130 /** 131 * Tests that a get on a table throws {@link RetriesExhaustedException} when the operation takes 132 * longer than 'hbase.client.operation.timeout'. 133 */ 134 @Test 135 public void testGetTimeout() { 136 DELAY_GET = 600; 137 try { 138 TABLE.get(new Get(ROW)); 139 Assert.fail("should not reach here"); 140 } catch (Exception e) { 141 Assert.assertTrue( 142 e instanceof SocketTimeoutException && e.getCause() instanceof CallTimeoutException); 143 } 144 } 145 146 /** 147 * Tests that a put on a table throws {@link RetriesExhaustedException} when the operation takes 148 * longer than 'hbase.client.operation.timeout'. 149 */ 150 @Test 151 public void testPutTimeout() { 152 DELAY_MUTATE = 600; 153 Put put = new Put(ROW); 154 put.addColumn(FAMILY, QUALIFIER, VALUE); 155 try { 156 TABLE.put(put); 157 Assert.fail("should not reach here"); 158 } catch (Exception e) { 159 Assert.assertTrue( 160 e instanceof SocketTimeoutException && e.getCause() instanceof CallTimeoutException); 161 } 162 } 163 164 /** 165 * Tests that a batch mutate and batch get on a table throws {@link SocketTimeoutException} or 166 * {@link OperationTimeoutExceededException} when the operation takes longer than 167 * 'hbase.client.operation.timeout'. 168 */ 169 @Test 170 public void testMultiTimeout() { 171 DELAY_BATCH = 600; 172 Put put1 = new Put(ROW); 173 put1.addColumn(FAMILY, QUALIFIER, VALUE); 174 Put put2 = new Put(ROW); 175 put2.addColumn(FAMILY, QUALIFIER, VALUE); 176 List<Put> puts = new ArrayList<>(); 177 puts.add(put1); 178 puts.add(put2); 179 assertMultiException(() -> TABLE.batch(puts, new Object[2])); 180 181 Get get1 = new Get(ROW); 182 get1.addColumn(FAMILY, QUALIFIER); 183 Get get2 = new Get(ROW); 184 get2.addColumn(FAMILY, QUALIFIER); 185 186 List<Get> gets = new ArrayList<>(); 187 gets.add(get1); 188 gets.add(get2); 189 assertMultiException(() -> TABLE.batch(gets, new Object[2])); 190 } 191 192 /** 193 * AsyncProcess has an overall waitUntilDone with a timeout, and if all callables dont finish by 194 * then it throws a SocketTimeoutException. The callables themselves also try to honor the 195 * operation timeout and result in OperationTimeoutExceededException (wrapped in 196 * RetriesExhausted). The latter is the more user-friendly exception because it contains details 197 * about which server has issues, etc. For now we need to account for both because it's sort of a 198 * race to see which timeout exceeds first. Maybe we can replace the waitUntilDone behavior with 199 * an interrupt in the future so we can further unify. 200 */ 201 private void assertMultiException(ThrowingRunnable runnable) { 202 IOException e = Assert.assertThrows(IOException.class, runnable); 203 if (e instanceof SocketTimeoutException) { 204 return; 205 } 206 Assert.assertTrue("Expected SocketTimeoutException or RetriesExhaustedWithDetailsException" 207 + " but was " + e.getClass(), e instanceof RetriesExhaustedWithDetailsException); 208 for (Throwable cause : ((RetriesExhaustedWithDetailsException) e).getCauses()) { 209 Assert.assertEquals(OperationTimeoutExceededException.class, cause.getClass()); 210 } 211 } 212 213 /** 214 * Tests that a batch get on a table throws 215 * {@link org.apache.hadoop.hbase.client.OperationTimeoutExceededException} when the region lookup 216 * takes longer than the 'hbase.client.operation.timeout'. This specifically tests that when meta 217 * is slow, the fetching of region locations for a batch is not allowed to itself exceed the 218 * operation timeout. In a batch size of 100, it's possible to need to make 100 meta calls in 219 * sequence. If meta is slow, we should abort the request once the operation timeout is exceeded, 220 * even if we haven't finished locating all regions. See HBASE-27490 221 */ 222 @Test 223 public void testMultiGetMetaTimeout() throws IOException { 224 Configuration conf = new Configuration(UTIL.getConfiguration()); 225 226 conf.setLong(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 400); 227 conf.setBoolean(CLIENT_SIDE_METRICS_ENABLED_KEY, true); 228 try (Connection specialConnection = ConnectionFactory.createConnection(conf); 229 Table specialTable = specialConnection.getTable(TABLE_NAME)) { 230 231 MetricsConnection metrics = 232 ((ConnectionImplementation) specialConnection).getConnectionMetrics(); 233 long metaCacheNumClearServerPreFailure = metrics.getMetaCacheNumClearServer().getCount(); 234 235 // delay and timeout are the same, so we should see a timeout after the first region lookup 236 DELAY_META_SCAN = 400; 237 238 List<Get> gets = new ArrayList<>(); 239 // we need to ensure the region look-ups eat up more time than the operation timeout without 240 // exceeding the scan timeout. 241 for (int i = 0; i < 100; i++) { 242 gets.add(new Get(Bytes.toBytes(i)).addColumn(FAMILY, QUALIFIER)); 243 } 244 try { 245 specialTable.get(gets); 246 Assert.fail("should not reach here"); 247 } catch (Exception e) { 248 RetriesExhaustedWithDetailsException expected = (RetriesExhaustedWithDetailsException) e; 249 Assert.assertEquals(100, expected.getNumExceptions()); 250 251 // verify we do not clear the cache in this situation otherwise we will create pathological 252 // feedback loop with multigets See: HBASE-27487 253 long metaCacheNumClearServerPostFailure = metrics.getMetaCacheNumClearServer().getCount(); 254 Assert.assertEquals(metaCacheNumClearServerPreFailure, metaCacheNumClearServerPostFailure); 255 256 for (Throwable cause : expected.getCauses()) { 257 Assert.assertTrue(cause instanceof OperationTimeoutExceededException); 258 // Check that this is the timeout thrown by AsyncRequestFutureImpl during region lookup 259 Assert.assertTrue(cause.getMessage().contains("Operation timeout exceeded during")); 260 } 261 } 262 } 263 } 264 265 /** 266 * Tests that a batch get on a table throws 267 * {@link org.apache.hadoop.hbase.client.OperationTimeoutExceededException} when retries are tuned 268 * too high to be able to be processed within the operation timeout. In this case, the final 269 * OperationTimeoutExceededException should not trigger a cache clear (but the individual failures 270 * may, if appropriate). This test skirts around the timeout checks during meta lookups from 271 * HBASE-27490, because we want to test for the case where meta lookups were able to succeed in 272 * time but did not leave enough time for the actual calls to occur. See HBASE-27487 273 */ 274 @Test 275 public void testMultiGetRetryTimeout() { 276 Configuration conf = new Configuration(UTIL.getConfiguration()); 277 278 // allow 1 retry, and 0 backoff 279 conf.setLong(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 500); 280 conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1); 281 conf.setLong(HConstants.HBASE_CLIENT_PAUSE, 0); 282 conf.setBoolean(CLIENT_SIDE_METRICS_ENABLED_KEY, true); 283 284 try (Connection specialConnection = ConnectionFactory.createConnection(conf); 285 Table specialTable = specialConnection.getTable(TABLE_NAME)) { 286 287 MetricsConnection metrics = 288 ((ConnectionImplementation) specialConnection).getConnectionMetrics(); 289 long metaCacheNumClearServerPreFailure = metrics.getMetaCacheNumClearRegion().getCount(); 290 291 // meta scan should take up most of the timeout but not all 292 DELAY_META_SCAN = 300; 293 // fail the batch call, causing a retry 294 FAIL_BATCH = true; 295 296 // Use a batch size of 1 so that we only make 1 meta call per attempt 297 List<Get> gets = new ArrayList<>(); 298 gets.add(new Get(Bytes.toBytes(0)).addColumn(FAMILY, QUALIFIER)); 299 300 try { 301 specialTable.batch(gets, new Object[1]); 302 Assert.fail("should not reach here"); 303 } catch (Exception e) { 304 RetriesExhaustedWithDetailsException expected = (RetriesExhaustedWithDetailsException) e; 305 Assert.assertEquals(1, expected.getNumExceptions()); 306 307 // We expect that the error caused by FAIL_BATCH would clear the meta cache but 308 // the OperationTimeoutExceededException should not. So only allow new cache clear here 309 long metaCacheNumClearServerPostFailure = metrics.getMetaCacheNumClearRegion().getCount(); 310 Assert.assertEquals(metaCacheNumClearServerPreFailure + 1, 311 metaCacheNumClearServerPostFailure); 312 313 for (Throwable cause : expected.getCauses()) { 314 Assert.assertTrue(cause instanceof OperationTimeoutExceededException); 315 // Check that this is the timeout thrown by CancellableRegionServerCallable 316 Assert.assertTrue(cause.getMessage().contains("Timeout exceeded before call began")); 317 } 318 } 319 } catch (IOException e) { 320 throw new RuntimeException(e); 321 } 322 } 323 324 /** 325 * Tests that scan on a table throws {@link RetriesExhaustedException} when the operation takes 326 * longer than 'hbase.client.scanner.timeout.period'. 327 */ 328 @Test 329 public void testScanTimeout() { 330 DELAY_SCAN = 600; 331 try { 332 ResultScanner scanner = TABLE.getScanner(new Scan()); 333 scanner.next(); 334 Assert.fail("should not reach here"); 335 } catch (Exception e) { 336 Assert.assertTrue( 337 e instanceof RetriesExhaustedException && e.getCause() instanceof SocketTimeoutException); 338 } 339 } 340 341 private static class DelayedRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer { 342 public DelayedRegionServer(Configuration conf) throws IOException, InterruptedException { 343 super(conf); 344 } 345 346 @Override 347 protected RSRpcServices createRpcServices() throws IOException { 348 return new DelayedRSRpcServices(this); 349 } 350 } 351 352 /** 353 * This {@link RSRpcServices} class injects delay for Rpc calls and after executes super methods. 354 */ 355 public static class DelayedRSRpcServices extends RSRpcServices { 356 DelayedRSRpcServices(HRegionServer rs) throws IOException { 357 super(rs); 358 } 359 360 @Override 361 public ClientProtos.GetResponse get(RpcController controller, ClientProtos.GetRequest request) 362 throws ServiceException { 363 try { 364 Thread.sleep(DELAY_GET); 365 } catch (InterruptedException e) { 366 LOG.error("Sleep interrupted during get operation", e); 367 } 368 return super.get(controller, request); 369 } 370 371 @Override 372 public ClientProtos.MutateResponse mutate(RpcController rpcc, 373 ClientProtos.MutateRequest request) throws ServiceException { 374 try { 375 Thread.sleep(DELAY_MUTATE); 376 } catch (InterruptedException e) { 377 LOG.error("Sleep interrupted during mutate operation", e); 378 } 379 return super.mutate(rpcc, request); 380 } 381 382 @Override 383 public ClientProtos.ScanResponse scan(RpcController controller, 384 ClientProtos.ScanRequest request) throws ServiceException { 385 try { 386 String regionName = Bytes.toString(request.getRegion().getValue().toByteArray()); 387 if (regionName.contains(TableName.META_TABLE_NAME.getNameAsString())) { 388 Thread.sleep(DELAY_META_SCAN); 389 } else { 390 Thread.sleep(DELAY_SCAN); 391 } 392 } catch (InterruptedException e) { 393 LOG.error("Sleep interrupted during scan operation", e); 394 } 395 return super.scan(controller, request); 396 } 397 398 @Override 399 public ClientProtos.MultiResponse multi(RpcController rpcc, ClientProtos.MultiRequest request) 400 throws ServiceException { 401 try { 402 if (FAIL_BATCH) { 403 throw new ServiceException(new NotServingRegionException("simulated failure")); 404 } 405 Thread.sleep(DELAY_BATCH); 406 } catch (InterruptedException e) { 407 LOG.error("Sleep interrupted during multi operation", e); 408 } 409 return super.multi(rpcc, request); 410 } 411 } 412}