/**
 * Copyright 2011 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.hadoop.conf.Configuration;
import com.google.common.base.Preconditions;

/**
 * A memstore-local allocation buffer.
 * <p>
 * The MemStoreLAB is basically a bump-the-pointer allocator that allocates
 * big (2MB by default) byte[] chunks from the JVM and then doles them out to
 * threads that request slices into the array.
 * <p>
 * The purpose of this class is to combat heap fragmentation in the
 * regionserver. By ensuring that all KeyValues in a given memstore refer
 * only to large chunks of contiguous memory, we ensure that large blocks
 * get freed up when the memstore is flushed.
 * <p>
 * Without the MSLAB, the byte arrays allocated during insertion end up
 * interleaved throughout the heap, and the old generation gets progressively
 * more fragmented until a stop-the-world compacting collection occurs.
 * <p>
 * Allocation is lock-free: the current chunk is held in an
 * {@link AtomicReference} and the bump pointer inside each chunk is an
 * {@link AtomicInteger}; both are advanced with compare-and-set.
 * <p>
 * TODO: we should probably benchmark whether word-aligning the allocations
 * would provide a performance improvement - probably would speed up the
 * Bytes.toLong/Bytes.toInt calls in KeyValue, but some of those are cached
 * anyway
 */
public class MemStoreLAB {
  /** The chunk currently being allocated from, or null if none is active. */
  private final AtomicReference<Chunk> curChunk = new AtomicReference<Chunk>();

  static final String CHUNK_SIZE_KEY = "hbase.hregion.memstore.mslab.chunksize";
  static final int CHUNK_SIZE_DEFAULT = 2048 * 1024;
  /** Size in bytes of every chunk requested from the JVM. */
  final int chunkSize;

  static final String MAX_ALLOC_KEY = "hbase.hregion.memstore.mslab.max.allocation";
  // allocs bigger than this don't go through allocator
  static final int MAX_ALLOC_DEFAULT = 256 * 1024;
  /** Largest request, in bytes, that this allocator will satisfy. */
  final int maxAlloc;

  /**
   * Create an allocator configured from a fresh {@link Configuration}.
   */
  public MemStoreLAB() {
    this(new Configuration());
  }

  /**
   * Create an allocator whose chunk size and maximum allocation size are read
   * from <code>conf</code>, falling back to the built-in defaults.
   *
   * @param conf configuration to read {@link #CHUNK_SIZE_KEY} and
   *             {@link #MAX_ALLOC_KEY} from
   * @throws IllegalArgumentException if the maximum allocation size exceeds
   *                                  the chunk size
   */
  public MemStoreLAB(Configuration conf) {
    chunkSize = conf.getInt(CHUNK_SIZE_KEY, CHUNK_SIZE_DEFAULT);
    maxAlloc = conf.getInt(MAX_ALLOC_KEY, MAX_ALLOC_DEFAULT);

    // if we don't exclude allocations >CHUNK_SIZE, we'd infiniteloop on one!
    Preconditions.checkArgument(
      maxAlloc <= chunkSize,
      MAX_ALLOC_KEY + " must be less than or equal to " + CHUNK_SIZE_KEY);
  }

  /**
   * Allocate a slice of the given length.
   *
   * If the size is larger than the maximum size specified for this
   * allocator, returns null.
   *
   * @param size number of bytes requested; must not be negative
   * @return the allocation, or null if <code>size</code> exceeds the
   *         configured maximum allocation
   * @throws IllegalArgumentException if <code>size</code> is negative
   */
  public Allocation allocateBytes(int size) {
    Preconditions.checkArgument(size >= 0, "negative size");

    // Callers should satisfy large allocations directly from JVM since they
    // don't cause fragmentation as badly.
    if (size > maxAlloc) {
      return null;
    }

    while (true) {
      Chunk c = getOrMakeChunk();

      // Try to allocate from this chunk
      int allocOffset = c.alloc(size);
      if (allocOffset != -1) {
        // We succeeded - this is the common case - small alloc
        // from a big buffer
        return new Allocation(c.data, allocOffset);
      }

      // not enough space (or the chunk never got its memory)!
      // try to retire this chunk and loop around for a fresh one
      tryRetireChunk(c);
    }
  }

  /**
   * Try to retire the current chunk if it is still
   * <code>c</code>. Postcondition is that curChunk.get()
   * != c
   */
  private void tryRetireChunk(Chunk c) {
    // If the CAS succeeds, that means that we won the race
    // to retire the chunk. We could use this opportunity to
    // update metrics on external fragmentation.
    //
    // If the CAS fails, that means that someone else already
    // retired the chunk for us. Either way the result is not needed.
    curChunk.compareAndSet(c, null);
  }

  /**
   * Get the current chunk, or, if there is no current chunk,
   * allocate a new one from the JVM.
   */
  private Chunk getOrMakeChunk() {
    while (true) {
      // Try to get the chunk
      Chunk c = curChunk.get();
      if (c != null) {
        return c;
      }

      // No current chunk, so we want to allocate one. We race
      // against other allocators to CAS in an uninitialized chunk
      // (which is cheap to allocate)
      c = new Chunk(chunkSize);
      if (curChunk.compareAndSet(null, c)) {
        // we won race - now we need to actually do the expensive
        // allocation step
        c.init();
        return c;
      }
      // someone else won race - that's fine, we'll try to grab theirs
      // in the next iteration of the loop.
    }
  }

  /**
   * A chunk of memory out of which allocations are sliced.
   */
  private static class Chunk {
    /** Sentinel offset: the backing array has not been allocated yet. */
    private static final int UNINITIALIZED = -1;
    /** Sentinel offset: allocating the backing array failed with an OOME. */
    private static final int OOM = -2;

    /**
     * Actual underlying data. Written once by init() before nextFreeOffset is
     * moved off UNINITIALIZED, and only read after observing that move.
     */
    private byte[] data;

    /**
     * Offset for the next allocation, or one of the negative sentinel values
     * {@link #UNINITIALIZED} / {@link #OOM}.
     */
    private final AtomicInteger nextFreeOffset = new AtomicInteger(UNINITIALIZED);

    /** Total number of allocations satisfied from this buffer */
    private final AtomicInteger allocCount = new AtomicInteger();

    /** Size of chunk in bytes */
    private final int size;

    /**
     * Create an uninitialized chunk. Note that memory is not allocated yet, so
     * this is cheap.
     * @param size in bytes
     */
    private Chunk(int size) {
      this.size = size;
    }

    /**
     * Actually claim the memory for this chunk. This should only be called from
     * the thread that constructed the chunk. It is thread-safe against other
     * threads calling alloc(), who will spin until the allocation is complete.
     * <p>
     * If the JVM cannot supply the memory, the chunk is marked as failed so
     * that threads spinning in alloc() are released (they see "no space" and
     * retire the chunk) and the error is rethrown to the caller.
     */
    public void init() {
      assert nextFreeOffset.get() == UNINITIALIZED;
      try {
        data = new byte[size];
      } catch (OutOfMemoryError e) {
        // Without this, every other thread that already picked up this chunk
        // from curChunk would spin forever waiting for initialization.
        nextFreeOffset.set(OOM);
        throw e;
      }
      // Mark that it's ready for use
      boolean initted = nextFreeOffset.compareAndSet(
          UNINITIALIZED, 0);
      // We should always succeed the above CAS since only one thread
      // calls init()!
      Preconditions.checkState(initted,
          "Multiple threads tried to init same chunk");
    }

    /**
     * Try to allocate <code>size</code> bytes from the chunk.
     * @return the offset of the successful allocation, or -1 to indicate
     *         not-enough-space (including a chunk whose memory could not be
     *         allocated)
     */
    public int alloc(int size) {
      while (true) {
        int oldOffset = nextFreeOffset.get();
        if (oldOffset == UNINITIALIZED) {
          // The chunk doesn't have its data allocated yet.
          // Since we found this in curChunk, we know that whoever
          // CAS-ed it there is allocating it right now. So spin-loop
          // shouldn't spin long!
          Thread.yield();
          continue;
        }
        if (oldOffset == OOM) {
          // init() failed; report "no space" so the caller retires this chunk
          return -1;
        }

        // Written as a subtraction so that a huge chunk size cannot make
        // oldOffset + size overflow and wrongly appear to fit.
        if (size > data.length - oldOffset) {
          return -1; // alloc doesn't fit
        }

        // Try to atomically claim this chunk
        if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size)) {
          // we got the alloc
          allocCount.incrementAndGet();
          return oldOffset;
        }
        // we raced and lost alloc, try again
      }
    }

    @Override
    public String toString() {
      // Use the configured size rather than data.length: data is still null
      // while the chunk is uninitialized or if its allocation failed.
      int offset = nextFreeOffset.get();
      String waste = (offset >= 0)
          ? String.valueOf(size - offset)
          : "n/a (chunk not initialized)";
      return "Chunk@" + System.identityHashCode(this) +
        " allocs=" + allocCount.get() + " waste=" + waste;
    }
  }

  /**
   * The result of a single allocation. Contains the chunk that the
   * allocation points into, and the offset in this array where the
   * slice begins.
   */
  public static class Allocation {
    private final byte[] data;
    private final int offset;

    private Allocation(byte[] data, int off) {
      this.data = data;
      this.offset = off;
    }

    @Override
    public String toString() {
      return "Allocation(data=" + data +
        " with capacity=" + data.length +
        ", off=" + offset + ")";
    }

    /** @return the backing array of the chunk this slice was carved from */
    byte[] getData() {
      return data;
    }

    /** @return the index in {@link #getData()} at which this slice begins */
    int getOffset() {
      return offset;
    }
  }
}