View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements. See the NOTICE file distributed with this
4    * work for additional information regarding copyright ownership. The ASF
5    * licenses this file to you under the Apache License, Version 2.0 (the
6    * "License"); you may not use this file except in compliance with the License.
7    * You may obtain a copy of the License at
8    *
9    * http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14   * License for the specific language governing permissions and limitations
15   * under the License.
16   */
17  package org.apache.hadoop.hbase.io.encoding;
18  
19  import java.io.DataInputStream;
20  import java.io.DataOutputStream;
21  import java.io.IOException;
22  import java.nio.ByteBuffer;
23  
24  import org.apache.hadoop.hbase.KeyValue;
25  import org.apache.hadoop.hbase.util.ByteBufferUtils;
26  import org.apache.hadoop.hbase.util.Bytes;
27  import org.apache.hadoop.io.RawComparator;
28  
29  /**
30   * Compress using:
31   * - store size of common prefix
32   * - save column family once, it is same within HFile
33   * - use integer compression for key, value and prefix (7-bit encoding)
34   * - use bits to avoid duplication key length, value length
35   *   and type if it same as previous
36   * - store in 3 bits length of timestamp field
37   * - allow diff in timestamp instead of actual value
38   *
39   * Format:
40   * - 1 byte:    flag
41   * - 1-5 bytes: key length (only if FLAG_SAME_KEY_LENGTH is not set in flag)
42   * - 1-5 bytes: value length (only if FLAG_SAME_VALUE_LENGTH is not set in flag)
43   * - 1-5 bytes: prefix length
44   * - ... bytes: rest of the row (if prefix length is small enough)
45   * - ... bytes: qualifier (or suffix depending on prefix length)
46   * - 1-8 bytes: timestamp or diff
47   * - 1 byte:    type (only if FLAG_SAME_TYPE is not set in the flag)
48   * - ... bytes: value
49   */
50  public class DiffKeyDeltaEncoder extends BufferedDataBlockEncoder {
51    static final int FLAG_SAME_KEY_LENGTH = 1;
52    static final int FLAG_SAME_VALUE_LENGTH = 1 << 1;
53    static final int FLAG_SAME_TYPE = 1 << 2;
54    static final int FLAG_TIMESTAMP_IS_DIFF = 1 << 3;
55    static final int MASK_TIMESTAMP_LENGTH = (1 << 4) | (1 << 5) | (1 << 6);
56    static final int SHIFT_TIMESTAMP_LENGTH = 4;
57    static final int FLAG_TIMESTAMP_SIGN = 1 << 7;
58  
59    protected static class DiffCompressionState extends CompressionState {
60      long timestamp;
61      byte[] familyNameWithSize;
62  
63      @Override
64      protected void readTimestamp(ByteBuffer in) {
65        timestamp = in.getLong();
66      }
67  
68      @Override
69      void copyFrom(CompressionState state) {
70        super.copyFrom(state);
71        DiffCompressionState state2 = (DiffCompressionState) state;
72        timestamp = state2.timestamp;
73      }
74    }
75  
76    private void compressSingleKeyValue(DiffCompressionState previousState,
77        DiffCompressionState currentState, DataOutputStream out,
78        ByteBuffer in) throws IOException {
79      byte flag = 0;
80      int kvPos = in.position();
81      int keyLength = in.getInt();
82      int valueLength = in.getInt();
83  
84      long timestamp;
85      long diffTimestamp = 0;
86      int diffTimestampFitsInBytes = 0;
87  
88      int commonPrefix;
89  
90      int timestampFitsInBytes;
91  
92      if (previousState.isFirst()) {
93        currentState.readKey(in, keyLength, valueLength);
94        currentState.prevOffset = kvPos;
95        timestamp = currentState.timestamp;
96        if (timestamp < 0) {
97          flag |= FLAG_TIMESTAMP_SIGN;
98          timestamp = -timestamp;
99        }
100       timestampFitsInBytes = ByteBufferUtils.longFitsIn(timestamp);
101 
102       flag |= (timestampFitsInBytes - 1) << SHIFT_TIMESTAMP_LENGTH;
103       commonPrefix = 0;
104 
105       // put column family
106       in.mark();
107       ByteBufferUtils.skip(in, currentState.rowLength
108           + KeyValue.ROW_LENGTH_SIZE);
109       ByteBufferUtils.moveBufferToStream(out, in, currentState.familyLength
110           + KeyValue.FAMILY_LENGTH_SIZE);
111       in.reset();
112     } else {
113       // find a common prefix and skip it
114       commonPrefix =
115           ByteBufferUtils.findCommonPrefix(in, in.position(),
116               previousState.prevOffset + KeyValue.ROW_OFFSET, keyLength
117                   - KeyValue.TIMESTAMP_TYPE_SIZE);
118       // don't compress timestamp and type using prefix
119 
120       currentState.readKey(in, keyLength, valueLength,
121           commonPrefix, previousState);
122       currentState.prevOffset = kvPos;
123       timestamp = currentState.timestamp;
124       boolean negativeTimestamp = timestamp < 0;
125       if (negativeTimestamp) {
126         timestamp = -timestamp;
127       }
128       timestampFitsInBytes = ByteBufferUtils.longFitsIn(timestamp);
129 
130       if (keyLength == previousState.keyLength) {
131         flag |= FLAG_SAME_KEY_LENGTH;
132       }
133       if (valueLength == previousState.valueLength) {
134         flag |= FLAG_SAME_VALUE_LENGTH;
135       }
136       if (currentState.type == previousState.type) {
137         flag |= FLAG_SAME_TYPE;
138       }
139 
140       // encode timestamp
141       diffTimestamp = previousState.timestamp - currentState.timestamp;
142       boolean minusDiffTimestamp = diffTimestamp < 0;
143       if (minusDiffTimestamp) {
144         diffTimestamp = -diffTimestamp;
145       }
146       diffTimestampFitsInBytes = ByteBufferUtils.longFitsIn(diffTimestamp);
147       if (diffTimestampFitsInBytes < timestampFitsInBytes) {
148         flag |= (diffTimestampFitsInBytes - 1) << SHIFT_TIMESTAMP_LENGTH;
149         flag |= FLAG_TIMESTAMP_IS_DIFF;
150         if (minusDiffTimestamp) {
151           flag |= FLAG_TIMESTAMP_SIGN;
152         }
153       } else {
154         flag |= (timestampFitsInBytes - 1) << SHIFT_TIMESTAMP_LENGTH;
155         if (negativeTimestamp) {
156           flag |= FLAG_TIMESTAMP_SIGN;
157         }
158       }
159     }
160 
161     out.write(flag);
162 
163     if ((flag & FLAG_SAME_KEY_LENGTH) == 0) {
164       ByteBufferUtils.putCompressedInt(out, keyLength);
165     }
166     if ((flag & FLAG_SAME_VALUE_LENGTH) == 0) {
167       ByteBufferUtils.putCompressedInt(out, valueLength);
168     }
169 
170     ByteBufferUtils.putCompressedInt(out, commonPrefix);
171     ByteBufferUtils.skip(in, commonPrefix);
172 
173     if (previousState.isFirst() ||
174         commonPrefix < currentState.rowLength + KeyValue.ROW_LENGTH_SIZE) {
175       int restRowLength =
176           currentState.rowLength + KeyValue.ROW_LENGTH_SIZE - commonPrefix;
177       ByteBufferUtils.moveBufferToStream(out, in, restRowLength);
178       ByteBufferUtils.skip(in, currentState.familyLength +
179           KeyValue.FAMILY_LENGTH_SIZE);
180       ByteBufferUtils.moveBufferToStream(out, in, currentState.qualifierLength);
181     } else {
182       ByteBufferUtils.moveBufferToStream(out, in,
183           keyLength - commonPrefix - KeyValue.TIMESTAMP_TYPE_SIZE);
184     }
185 
186     if ((flag & FLAG_TIMESTAMP_IS_DIFF) == 0) {
187       ByteBufferUtils.putLong(out, timestamp, timestampFitsInBytes);
188     } else {
189       ByteBufferUtils.putLong(out, diffTimestamp, diffTimestampFitsInBytes);
190     }
191 
192     if ((flag & FLAG_SAME_TYPE) == 0) {
193       out.write(currentState.type);
194     }
195     ByteBufferUtils.skip(in, KeyValue.TIMESTAMP_TYPE_SIZE);
196 
197     ByteBufferUtils.moveBufferToStream(out, in, valueLength);
198   }
199 
200   private void uncompressSingleKeyValue(DataInputStream source,
201       ByteBuffer buffer,
202       DiffCompressionState state)
203           throws IOException, EncoderBufferTooSmallException {
204     // read the column family at the beginning
205     if (state.isFirst()) {
206       state.familyLength = source.readByte();
207       state.familyNameWithSize =
208           new byte[(state.familyLength & 0xff) + KeyValue.FAMILY_LENGTH_SIZE];
209       state.familyNameWithSize[0] = state.familyLength;
210       source.read(state.familyNameWithSize, KeyValue.FAMILY_LENGTH_SIZE,
211           state.familyLength);
212     }
213 
214     // read flag
215     byte flag = source.readByte();
216 
217     // read key/value/common lengths
218     int keyLength;
219     int valueLength;
220     if ((flag & FLAG_SAME_KEY_LENGTH) != 0) {
221       keyLength = state.keyLength;
222     } else {
223       keyLength = ByteBufferUtils.readCompressedInt(source);
224     }
225     if ((flag & FLAG_SAME_VALUE_LENGTH) != 0) {
226       valueLength = state.valueLength;
227     } else {
228       valueLength = ByteBufferUtils.readCompressedInt(source);
229     }
230     int commonPrefix = ByteBufferUtils.readCompressedInt(source);
231 
232     // create KeyValue buffer and fill it prefix
233     int keyOffset = buffer.position();
234     ByteBufferUtils.ensureSpace(buffer, keyLength + valueLength
235         + KeyValue.ROW_OFFSET);
236     buffer.putInt(keyLength);
237     buffer.putInt(valueLength);
238 
239     // copy common from previous key
240     if (commonPrefix > 0) {
241       ByteBufferUtils.copyFromBufferToBuffer(buffer, buffer, state.prevOffset
242           + KeyValue.ROW_OFFSET, commonPrefix);
243     }
244 
245     // copy the rest of the key from the buffer
246     int keyRestLength;
247     if (state.isFirst() || commonPrefix <
248         state.rowLength + KeyValue.ROW_LENGTH_SIZE) {
249       // omit the family part of the key, it is always the same
250       short rowLength;
251       int rowRestLength;
252 
253       // check length of row
254       if (commonPrefix < KeyValue.ROW_LENGTH_SIZE) {
255         // not yet copied, do it now
256         ByteBufferUtils.copyFromStreamToBuffer(buffer, source,
257             KeyValue.ROW_LENGTH_SIZE - commonPrefix);
258         ByteBufferUtils.skip(buffer, -KeyValue.ROW_LENGTH_SIZE);
259         rowLength = buffer.getShort();
260         rowRestLength = rowLength;
261       } else {
262         // already in buffer, just read it
263         rowLength = buffer.getShort(keyOffset + KeyValue.ROW_OFFSET);
264         rowRestLength = rowLength + KeyValue.ROW_LENGTH_SIZE - commonPrefix;
265       }
266 
267       // copy the rest of row
268       ByteBufferUtils.copyFromStreamToBuffer(buffer, source, rowRestLength);
269       state.rowLength = rowLength;
270 
271       // copy the column family
272       buffer.put(state.familyNameWithSize);
273 
274       keyRestLength = keyLength - rowLength -
275           state.familyNameWithSize.length -
276           (KeyValue.ROW_LENGTH_SIZE + KeyValue.TIMESTAMP_TYPE_SIZE);
277     } else {
278       // prevRowWithSizeLength is the same as on previous row
279       keyRestLength = keyLength - commonPrefix - KeyValue.TIMESTAMP_TYPE_SIZE;
280     }
281     // copy the rest of the key, after column family -> column qualifier
282     ByteBufferUtils.copyFromStreamToBuffer(buffer, source, keyRestLength);
283 
284     // handle timestamp
285     int timestampFitsInBytes =
286         ((flag & MASK_TIMESTAMP_LENGTH) >>> SHIFT_TIMESTAMP_LENGTH) + 1;
287     long timestamp = ByteBufferUtils.readLong(source, timestampFitsInBytes);
288     if ((flag & FLAG_TIMESTAMP_SIGN) != 0) {
289       timestamp = -timestamp;
290     }
291     if ((flag & FLAG_TIMESTAMP_IS_DIFF) != 0) {
292       timestamp = state.timestamp - timestamp;
293     }
294     buffer.putLong(timestamp);
295 
296     // copy the type field
297     byte type;
298     if ((flag & FLAG_SAME_TYPE) != 0) {
299       type = state.type;
300     } else {
301       type = source.readByte();
302     }
303     buffer.put(type);
304 
305     // copy value part
306     ByteBufferUtils.copyFromStreamToBuffer(buffer, source, valueLength);
307 
308     state.keyLength = keyLength;
309     state.valueLength = valueLength;
310     state.prevOffset = keyOffset;
311     state.timestamp = timestamp;
312     state.type = type;
313     // state.qualifier is unused
314   }
315 
316   @Override
317   public void compressKeyValues(DataOutputStream out,
318       ByteBuffer in, boolean includesMemstoreTS) throws IOException {
319     in.rewind();
320     ByteBufferUtils.putInt(out, in.limit());
321     DiffCompressionState previousState = new DiffCompressionState();
322     DiffCompressionState currentState = new DiffCompressionState();
323     while (in.hasRemaining()) {
324       compressSingleKeyValue(previousState, currentState,
325           out, in);
326       afterEncodingKeyValue(in, out, includesMemstoreTS);
327 
328       // swap previousState <-> currentState
329       DiffCompressionState tmp = previousState;
330       previousState = currentState;
331       currentState = tmp;
332     }
333   }
334 
335   @Override
336   public ByteBuffer uncompressKeyValues(DataInputStream source,
337       int allocHeaderLength, int skipLastBytes, boolean includesMemstoreTS)
338       throws IOException {
339     int decompressedSize = source.readInt();
340     ByteBuffer buffer = ByteBuffer.allocate(decompressedSize +
341         allocHeaderLength);
342     buffer.position(allocHeaderLength);
343     DiffCompressionState state = new DiffCompressionState();
344     while (source.available() > skipLastBytes) {
345       uncompressSingleKeyValue(source, buffer, state);
346       afterDecodingKeyValue(source, buffer, includesMemstoreTS);
347     }
348 
349     if (source.available() != skipLastBytes) {
350       throw new IllegalStateException("Read too much bytes.");
351     }
352 
353     return buffer;
354   }
355 
356   @Override
357   public ByteBuffer getFirstKeyInBlock(ByteBuffer block) {
358     block.mark();
359     block.position(Bytes.SIZEOF_INT);
360     byte familyLength = block.get();
361     ByteBufferUtils.skip(block, familyLength);
362     byte flag = block.get();
363     int keyLength = ByteBufferUtils.readCompressedInt(block);
364     ByteBufferUtils.readCompressedInt(block); // valueLength
365     ByteBufferUtils.readCompressedInt(block); // commonLength
366     ByteBuffer result = ByteBuffer.allocate(keyLength);
367 
368     // copy row
369     int pos = result.arrayOffset();
370     block.get(result.array(), pos, Bytes.SIZEOF_SHORT);
371     pos += Bytes.SIZEOF_SHORT;
372     short rowLength = result.getShort();
373     block.get(result.array(), pos, rowLength);
374     pos += rowLength;
375 
376     // copy family
377     int savePosition = block.position();
378     block.position(Bytes.SIZEOF_INT);
379     block.get(result.array(), pos, familyLength + Bytes.SIZEOF_BYTE);
380     pos += familyLength + Bytes.SIZEOF_BYTE;
381 
382     // copy qualifier
383     block.position(savePosition);
384     int qualifierLength =
385         keyLength - pos + result.arrayOffset() - KeyValue.TIMESTAMP_TYPE_SIZE;
386     block.get(result.array(), pos, qualifierLength);
387     pos += qualifierLength;
388 
389     // copy the timestamp and type
390     int timestampFitInBytes =
391         ((flag & MASK_TIMESTAMP_LENGTH) >>> SHIFT_TIMESTAMP_LENGTH) + 1;
392     long timestamp = ByteBufferUtils.readLong(block, timestampFitInBytes);
393     if ((flag & FLAG_TIMESTAMP_SIGN) != 0) {
394       timestamp = -timestamp;
395     }
396     result.putLong(pos, timestamp);
397     pos += Bytes.SIZEOF_LONG;
398     block.get(result.array(), pos, Bytes.SIZEOF_BYTE);
399 
400     block.reset();
401     return result;
402   }
403 
404   @Override
405   public String toString() {
406     return DiffKeyDeltaEncoder.class.getSimpleName();
407   }
408 
409   protected static class DiffSeekerState extends SeekerState {
410     private int rowLengthWithSize;
411     private long timestamp;
412 
413     @Override
414     protected void copyFromNext(SeekerState that) {
415       super.copyFromNext(that);
416       DiffSeekerState other = (DiffSeekerState) that;
417       rowLengthWithSize = other.rowLengthWithSize;
418       timestamp = other.timestamp;
419     }
420   }
421 
422   @Override
423   public EncodedSeeker createSeeker(RawComparator<byte[]> comparator,
424       final boolean includesMemstoreTS) {
425     return new BufferedEncodedSeeker<DiffSeekerState>(comparator) {
426       private byte[] familyNameWithSize;
427       private static final int TIMESTAMP_WITH_TYPE_LENGTH =
428           Bytes.SIZEOF_LONG + Bytes.SIZEOF_BYTE;
429 
430       private void decode(boolean isFirst) {
431         byte flag = currentBuffer.get();
432         byte type = 0;
433         if ((flag & FLAG_SAME_KEY_LENGTH) == 0) {
434           if (!isFirst) {
435             type = current.keyBuffer[current.keyLength - Bytes.SIZEOF_BYTE];
436           }
437           current.keyLength = ByteBufferUtils.readCompressedInt(currentBuffer);
438         }
439         if ((flag & FLAG_SAME_VALUE_LENGTH) == 0) {
440           current.valueLength =
441               ByteBufferUtils.readCompressedInt(currentBuffer);
442         }
443         current.lastCommonPrefix =
444             ByteBufferUtils.readCompressedInt(currentBuffer);
445 
446         current.ensureSpaceForKey();
447 
448         if (current.lastCommonPrefix < Bytes.SIZEOF_SHORT) {
449           // length of row is different, copy everything except family
450 
451           // copy the row size
452           currentBuffer.get(current.keyBuffer, current.lastCommonPrefix,
453               Bytes.SIZEOF_SHORT - current.lastCommonPrefix);
454           current.rowLengthWithSize = Bytes.toShort(current.keyBuffer, 0) +
455               Bytes.SIZEOF_SHORT;
456 
457           // copy the rest of row
458           currentBuffer.get(current.keyBuffer, Bytes.SIZEOF_SHORT,
459               current.rowLengthWithSize - Bytes.SIZEOF_SHORT);
460 
461           // copy the column family
462           System.arraycopy(familyNameWithSize, 0, current.keyBuffer,
463               current.rowLengthWithSize, familyNameWithSize.length);
464 
465           // copy the qualifier
466           currentBuffer.get(current.keyBuffer,
467               current.rowLengthWithSize + familyNameWithSize.length,
468               current.keyLength - current.rowLengthWithSize -
469               familyNameWithSize.length - TIMESTAMP_WITH_TYPE_LENGTH);
470         } else if (current.lastCommonPrefix < current.rowLengthWithSize) {
471           // we have to copy part of row and qualifier,
472           // but column family is in right place
473 
474           // before column family (rest of row)
475           currentBuffer.get(current.keyBuffer, current.lastCommonPrefix,
476               current.rowLengthWithSize - current.lastCommonPrefix);
477 
478           // after column family (qualifier)
479           currentBuffer.get(current.keyBuffer,
480               current.rowLengthWithSize + familyNameWithSize.length,
481               current.keyLength - current.rowLengthWithSize -
482               familyNameWithSize.length - TIMESTAMP_WITH_TYPE_LENGTH);
483         } else {
484           // copy just the ending
485           currentBuffer.get(current.keyBuffer, current.lastCommonPrefix,
486               current.keyLength - TIMESTAMP_WITH_TYPE_LENGTH -
487               current.lastCommonPrefix);
488         }
489 
490         // timestamp
491         int pos = current.keyLength - TIMESTAMP_WITH_TYPE_LENGTH;
492         int timestampFitInBytes = 1 +
493             ((flag & MASK_TIMESTAMP_LENGTH) >>> SHIFT_TIMESTAMP_LENGTH);
494         long timestampOrDiff =
495             ByteBufferUtils.readLong(currentBuffer, timestampFitInBytes);
496         if ((flag & FLAG_TIMESTAMP_SIGN) != 0) {
497           timestampOrDiff = -timestampOrDiff;
498         }
499         if ((flag & FLAG_TIMESTAMP_IS_DIFF) == 0) { // it is timestamp
500           current.timestamp = timestampOrDiff;
501         } else { // it is diff
502           current.timestamp = current.timestamp - timestampOrDiff;
503         }
504         Bytes.putLong(current.keyBuffer, pos, current.timestamp);
505         pos += Bytes.SIZEOF_LONG;
506 
507         // type
508         if ((flag & FLAG_SAME_TYPE) == 0) {
509           currentBuffer.get(current.keyBuffer, pos, Bytes.SIZEOF_BYTE);
510         } else if ((flag & FLAG_SAME_KEY_LENGTH) == 0) {
511           current.keyBuffer[pos] = type;
512         }
513 
514         current.valueOffset = currentBuffer.position();
515         ByteBufferUtils.skip(currentBuffer, current.valueLength);
516 
517         if (includesMemstoreTS) {
518           current.memstoreTS = ByteBufferUtils.readVLong(currentBuffer);
519         } else {
520           current.memstoreTS = 0;
521         }
522         current.nextKvOffset = currentBuffer.position();
523       }
524 
525       @Override
526       protected void decodeFirst() {
527         ByteBufferUtils.skip(currentBuffer, Bytes.SIZEOF_INT);
528 
529         // read column family
530         byte familyNameLength = currentBuffer.get();
531         familyNameWithSize = new byte[familyNameLength + Bytes.SIZEOF_BYTE];
532         familyNameWithSize[0] = familyNameLength;
533         currentBuffer.get(familyNameWithSize, Bytes.SIZEOF_BYTE,
534             familyNameLength);
535         decode(true);
536       }
537 
538       @Override
539       protected void decodeNext() {
540         decode(false);
541       }
542 
543       @Override
544       protected DiffSeekerState createSeekerState() {
545         return new DiffSeekerState();
546       }
547     };
548   }
549 }