/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparatorImpl;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.ExtendedCell;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValueTestUtil;
import org.apache.hadoop.hbase.PrivateCellUtil;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.BlockCacheFactory;
import org.apache.hadoop.hbase.util.BloomFilterUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.Test;
import org.junit.runners.Parameterized.Parameter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tests optimized scanning of multiple columns. <br>
 * We split the big test into several subclass unit tests because, with the ROWCOL bloom type, the
 * row-col bloom filter is consulted every time the scanner switches from one column to another
 * (to save an HDFS seek). That is CPU-intensive (~45s per case), so the ROWCOL cases were moved
 * into separate LargeTests to avoid timeout failures. <br>
 * <br>
 * To be clear: TestMultiColumnScanner flushes 10 HFiles (NUM_FLUSHES=10), and the table holds
 * ~1000 cells (rows=20, ts=6, qualifiers=8, total=20*6*8 ~ 1000). Each full-table scan checks the
 * ROWCOL bloom filter 20 (rows) * 8 (columns) * 10 (hfiles) = 1600 times, and the table is
 * scanned 6*2^8=1536 times, so in total there are 1600*1536=2457600 bloom filter checks.
 * (See HBASE-21520)
 */
public abstract class TestMultiColumnScanner {

  private static final Logger LOG = LoggerFactory.getLogger(TestMultiColumnScanner.class);

  private static final String TABLE_NAME = TestMultiColumnScanner.class.getSimpleName();

  static final int MAX_VERSIONS = 50;

  private static final String FAMILY = "CF";
  private static final byte[] FAMILY_BYTES = Bytes.toBytes(FAMILY);

  /**
   * The size of the column qualifier set used. Increasing this parameter exponentially increases
   * test time.
   */
  private static final int NUM_COLUMNS = 8;

  private static final int MAX_COLUMN_BIT_MASK = 1 << NUM_COLUMNS - 1;
  private static final int NUM_FLUSHES = 10;
  private static final int NUM_ROWS = 20;

  /** A large value of type long for use as a timestamp */
  private static final long BIG_LONG = 9111222333444555666L;

  /**
   * Timestamps to test with. Cannot use {@link Long#MAX_VALUE} here, because it will be replaced
   * by a timestamp auto-generated based on the current time.
   */
  private static final long[] TIMESTAMPS =
    new long[] { 1, 3, 5, Integer.MAX_VALUE, BIG_LONG, Long.MAX_VALUE - 1 };

  /** The probability that a column is skipped in a store file. */
  private static final double COLUMN_SKIP_IN_STORE_FILE_PROB = 0.7;

  /** The probability to delete a row/column pair */
  private static final double DELETE_PROBABILITY = 0.02;

  private final static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();

  @Parameter(0)
  public Compression.Algorithm comprAlgo;

  @Parameter(1)
  public BloomType bloomType;

  @Parameter(2)
  public DataBlockEncoding dataBlockEncoding;

  // Some static sanity-checking.
  static {
    assertTrue(BIG_LONG > 0.9 * Long.MAX_VALUE); // Guard against typos.

    // Ensure TIMESTAMPS are sorted.
    for (int i = 0; i < TIMESTAMPS.length - 1; ++i)
      assertTrue(TIMESTAMPS[i] < TIMESTAMPS[i + 1]);
  }

  public static Collection<Object[]> generateParams(Compression.Algorithm algo,
    boolean useDataBlockEncoding) {
    List<Object[]> parameters = new ArrayList<>();
    for (BloomType bloomType : BloomType.values()) {
      DataBlockEncoding dataBlockEncoding =
        useDataBlockEncoding ? DataBlockEncoding.PREFIX : DataBlockEncoding.NONE;
      parameters.add(new Object[] { algo, bloomType, dataBlockEncoding });
    }
    return parameters;
  }

  @Test
  public void testMultiColumnScanner() throws IOException {
    TEST_UTIL.getConfiguration().setInt(BloomFilterUtil.PREFIX_LENGTH_KEY, 10);
    HRegion region = TEST_UTIL.createTestRegion(TABLE_NAME,
      ColumnFamilyDescriptorBuilder.newBuilder(FAMILY_BYTES).setCompressionType(comprAlgo)
        .setBloomFilterType(bloomType).setMaxVersions(MAX_VERSIONS)
        .setDataBlockEncoding(dataBlockEncoding).build(),
      BlockCacheFactory.createBlockCache(TEST_UTIL.getConfiguration()));
    List<String> rows = sequentialStrings("row", NUM_ROWS);
    List<String> qualifiers = sequentialStrings("qual", NUM_COLUMNS);
    List<KeyValue> kvs = new ArrayList<>();
    Set<String> keySet = new HashSet<>();

    // A map from <row>_<qualifier> to the most recent delete timestamp for
    // that column.
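    // (Consulted by matchesQuery(...) below: a cell is expected in the scan
    // output only if its timestamp is strictly greater than the last delete
    // timestamp recorded for its row/column pair.)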
    Map<String, Long> lastDelTimeMap = new HashMap<>();

    Random rand = ThreadLocalRandom.current();
    for (int iFlush = 0; iFlush < NUM_FLUSHES; ++iFlush) {
      for (String qual : qualifiers) {
        // This is where we decide whether to include this column in this
        // store file, regardless of row and timestamp.
        if (rand.nextDouble() < COLUMN_SKIP_IN_STORE_FILE_PROB) continue;

        byte[] qualBytes = Bytes.toBytes(qual);
        for (String row : rows) {
          Put p = new Put(Bytes.toBytes(row));
          for (long ts : TIMESTAMPS) {
            String value = createValue(row, qual, ts);
            KeyValue kv = KeyValueTestUtil.create(row, FAMILY, qual, ts, value);
            assertEquals(kv.getTimestamp(), ts);
            p.add(kv);
            String keyAsString = kv.toString();
            if (!keySet.contains(keyAsString)) {
              keySet.add(keyAsString);
              kvs.add(kv);
            }
          }
          region.put(p);

          Delete d = new Delete(Bytes.toBytes(row));
          boolean deletedSomething = false;
          for (long ts : TIMESTAMPS)
            if (rand.nextDouble() < DELETE_PROBABILITY) {
              d.addColumns(FAMILY_BYTES, qualBytes, ts);
              String rowAndQual = row + "_" + qual;
              Long whenDeleted = lastDelTimeMap.get(rowAndQual);
              lastDelTimeMap.put(rowAndQual, whenDeleted == null ? ts : Math.max(ts, whenDeleted));
              deletedSomething = true;
            }
          if (deletedSomething) region.delete(d);
        }
      }
      region.flush(true);
    }

    Collections.sort(kvs, CellComparatorImpl.COMPARATOR);
    for (int maxVersions = 1; maxVersions <= TIMESTAMPS.length; ++maxVersions) {
      for (int columnBitMask = 1; columnBitMask <= MAX_COLUMN_BIT_MASK; ++columnBitMask) {
        Scan scan = new Scan();
        scan.readVersions(maxVersions);
        Set<String> qualSet = new TreeSet<>();
        {
          // Decode the bitmask: bit i selects qualifiers.get(i) for this scan.
          int columnMaskTmp = columnBitMask;
          for (String qual : qualifiers) {
            if ((columnMaskTmp & 1) != 0) {
              scan.addColumn(FAMILY_BYTES, Bytes.toBytes(qual));
              qualSet.add(qual);
            }
            columnMaskTmp >>= 1;
          }
          assertEquals(0, columnMaskTmp);
        }

        InternalScanner scanner = region.getScanner(scan);
        List<ExtendedCell> results = new ArrayList<>();

        int kvPos = 0;
        int numResults = 0;
        String queryInfo = "columns queried: " + qualSet + " (columnBitMask=" + columnBitMask
          + "), maxVersions=" + maxVersions;

        while (scanner.next(results) || results.size() > 0) {
          for (ExtendedCell kv : results) {
            while (
              kvPos < kvs.size()
                && !matchesQuery(kvs.get(kvPos), qualSet, maxVersions, lastDelTimeMap)
            ) {
              ++kvPos;
            }
            String rowQual = getRowQualStr(kv);
            String deleteInfo = "";
            Long lastDelTS = lastDelTimeMap.get(rowQual);
            if (lastDelTS != null) {
              deleteInfo =
                "; last timestamp when row/column " + rowQual + " was deleted: " + lastDelTS;
            }
            assertTrue(
              "Scanner returned additional key/value: " + kv + ", " + queryInfo + deleteInfo + ";",
              kvPos < kvs.size());
            assertTrue("Scanner returned wrong key/value; " + queryInfo + deleteInfo + ";",
              PrivateCellUtil.equalsIgnoreMvccVersion(kvs.get(kvPos), kv));
            ++kvPos;
            ++numResults;
          }
          results.clear();
        }
        for (; kvPos < kvs.size(); ++kvPos) {
          KeyValue remainingKV = kvs.get(kvPos);
          assertFalse(
            "Matching column not returned by scanner: " + remainingKV + ", " + queryInfo
              + ", results returned: " + numResults,
            matchesQuery(remainingKV, qualSet, maxVersions, lastDelTimeMap));
        }
      }
    }
    assertTrue("This test is supposed to delete at least some row/column pairs",
      lastDelTimeMap.size() > 0);
    LOG.info("Number of row/col pairs deleted at least once: " + lastDelTimeMap.size());
    HBaseTestingUtil.closeRegionAndWAL(region);
  }

  private static String getRowQualStr(Cell kv) {
    String rowStr = Bytes.toString(CellUtil.cloneRow(kv));
    String qualStr = Bytes.toString(CellUtil.cloneQualifier(kv));
    return rowStr + "_" + qualStr;
  }

  private static boolean matchesQuery(KeyValue kv, Set<String> qualSet, int maxVersions,
    Map<String, Long> lastDelTimeMap) {
    Long lastDelTS = lastDelTimeMap.get(getRowQualStr(kv));
    long ts = kv.getTimestamp();
    return qualSet.contains(qualStr(kv)) && ts >= TIMESTAMPS[TIMESTAMPS.length - maxVersions]
      && (lastDelTS == null || ts > lastDelTS);
  }

  private static String qualStr(KeyValue kv) {
    return Bytes.toString(kv.getQualifierArray(), kv.getQualifierOffset(), kv.getQualifierLength());
  }

  static String createValue(String row, String qual, long ts) {
    return "value_for_" + row + "_" + qual + "_" + ts;
  }

  private static List<String> sequentialStrings(String prefix, int n) {
    List<String> lst = new ArrayList<>();
    for (int i = 0; i < n; ++i) {
      StringBuilder sb = new StringBuilder();
      sb.append(prefix + i);

      // Make column length depend on i.
      int iBitShifted = i;
      while (iBitShifted != 0) {
        sb.append((iBitShifted & 1) == 0 ? 'a' : 'b');
        iBitShifted >>= 1;
      }

      lst.add(sb.toString());
    }
    return lst;
  }
}
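
// Usage sketch (hypothetical subclass shown for illustration): the class is
// abstract and its @Parameter fields imply a JUnit 4 Parameterized runner, so
// concrete subclasses are expected to feed generateParams(...) into a
// @Parameters factory method, roughly like:
//
//   @RunWith(Parameterized.class)
//   @Category(LargeTests.class)
//   public class TestMultiColumnScannerWithNoneCompression extends TestMultiColumnScanner {
//     @Parameters
//     public static Collection<Object[]> parameters() {
//       // No data block encoding; one parameter set per BloomType.
//       return TestMultiColumnScanner.generateParams(Compression.Algorithm.NONE, false);
//     }
//   }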