1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18 package org.apache.hadoop.hbase.util;
19
20 import static org.apache.hadoop.hbase.util.Order.ASCENDING;
21 import static org.apache.hadoop.hbase.util.Order.DESCENDING;
22
23 import java.math.BigDecimal;
24 import java.math.BigInteger;
25 import java.math.MathContext;
26 import java.math.RoundingMode;
27 import java.nio.charset.Charset;
28 import java.util.Comparator;
29
30 import org.apache.hadoop.classification.InterfaceAudience;
31 import org.apache.hadoop.classification.InterfaceStability;
32
33 import com.google.common.annotations.VisibleForTesting;
34
35 /**
36 * Utility class that handles ordered byte arrays. That is, unlike
37 * {@link Bytes}, these methods produce byte arrays which maintain the sort
38 * order of the original values.
39 * <h3>Encoding Format summary</h3>
40 * <p>
41 * Each value is encoded as one or more bytes. The first byte of the encoding,
42 * its meaning, and a terse description of the bytes that follow is given by
43 * the following table:
44 * <table>
45 * <tr><th>Content Type</th><th>Encoding</th></tr>
46 * <tr><td>NULL</td><td>0x05</td></tr>
47 * <tr><td>negative infinity</td><td>0x07</td></tr>
48 * <tr><td>negative large</td><td>0x08, ~E, ~M</td></tr>
49 * <tr><td>negative medium</td><td>0x13-E, ~M</td></tr>
50 * <tr><td>negative small</td><td>0x14, -E, ~M</td></tr>
51 * <tr><td>zero</td><td>0x15</td></tr>
52 * <tr><td>positive small</td><td>0x16, ~-E, M</td></tr>
53 * <tr><td>positive medium</td><td>0x17+E, M</td></tr>
54 * <tr><td>positive large</td><td>0x22, E, M</td></tr>
55 * <tr><td>positive infinity</td><td>0x23</td></tr>
56 * <tr><td>NaN</td><td>0x25</td></tr>
57 * <tr><td>fixed-length 32-bit integer</td><td>0x27, I</td></tr>
58 * <tr><td>fixed-length 64-bit integer</td><td>0x28, I</td></tr>
59 * <tr><td>fixed-length 32-bit float</td><td>0x30, F</td></tr>
60 * <tr><td>fixed-length 64-bit float</td><td>0x31, F</td></tr>
61 * <tr><td>TEXT</td><td>0x33, T</td></tr>
62 * <tr><td>variable length BLOB</td><td>0x35, B</td></tr>
63 * <tr><td>byte-for-byte BLOB</td><td>0x36, X</td></tr>
64 * </table>
65 * </p>
66 *
67 * <h3>Null Encoding</h3>
68 * <p>
69 * Each value that is a NULL encodes as a single byte of 0x05. Since every
70 * other value encoding begins with a byte greater than 0x05, this forces NULL
71 * values to sort first.
72 * </p>
73 * <h3>Text Encoding</h3>
74 * <p>
75 * Each text value begins with a single byte of 0x33 and ends with a single
76 * byte of 0x00. There are zero or more intervening bytes that encode the text
77 * value. The intervening bytes are chosen so that the encoding will sort in
78 * the desired collating order. The intervening bytes may not contain a 0x00
79 * character; the only 0x00 byte allowed in a text encoding is the final byte.
80 * </p>
81 * <p>
82 * The text encoding ends in 0x00 in order to ensure that when there are two
83 * strings where one is a prefix of the other that the shorter string will
84 * sort first.
85 * </p>
86 * <h3>Binary Encoding</h3>
87 * <p>
88 * There are two encoding strategies for binary fields, referred to as
89 * "BlobVar" and "BlobCopy". BlobVar is less efficient in both space and
90 * encoding time. It has no limitations on the range of encoded values.
91 * BlobCopy is a byte-for-byte copy of the input data followed by a
92 * termination byte. It is extremely fast to encode and decode. It carries the
93 * restriction of not allowing a 0x00 value in the input byte[] as this value
94 * is used as the termination byte.
95 * </p>
96 * <h4>BlobVar</h4>
97 * <p>
98 * "BlobVar" encodes the input byte[] in a manner similar to a variable length
99 * integer encoding. As with the other {@code OrderedBytes} encodings,
100 * the first encoded byte is used to indicate what kind of value follows. This
101 * header byte is 0x35 for BlobVar encoded values. As with the traditional
102 * varint encoding, the most significant bit of each subsequent encoded
103 * {@code byte} is used as a continuation marker. The 7 remaining bits
104 * contain the 7 most significant bits of the first unencoded byte. The next
105 * encoded byte starts with a continuation marker in the MSB. The least
106 * significant bit from the first unencoded byte follows, and the remaining 6
107 * bits contain the 6 MSBs of the second unencoded byte. The encoding
108 * continues, encoding 7 bytes on to 8 encoded bytes. The MSB of the final
109 * encoded byte contains a termination marker rather than a continuation
110 * marker, and any remaining bits from the final input byte. Any trailing bits
111 * in the final encoded byte are zeros.
112 * </p>
113 * <h4>BlobCopy</h4>
114 * <p>
115 * "BlobCopy" is a simple byte-for-byte copy of the input data. It uses 0x36
116 * as the header byte, and is terminated by 0x00 in the DESCENDING case. This
117 * alternative encoding is faster and more space-efficient, but it cannot
118 * accept values containing a 0x00 byte in DESCENDING order.
119 * </p>
120 * <h3>Variable-length Numeric Encoding</h3>
121 * <p>
122 * Numeric values must be coded so as to sort in numeric order. We assume that
123 * numeric values can be both integer and floating point values. Clients must
124 * be careful to use inspection methods for encoded values (such as
125 * {@link #isNumericInfinite(PositionedByteRange)} and
126 * {@link #isNumericNaN(PositionedByteRange)}) to protect against decoding
127 * values into objects which do not support these numeric concepts (such as
128 * {@link Long} and {@link BigDecimal}).
129 * </p>
130 * <p>
131 * Simplest cases first: If the numeric value is a NaN, then the encoding is a
132 * single byte of 0x25. This causes NaN values to sort after every other
133 * numeric value.
134 * </p>
135 * <p>
136 * If the numeric value is a negative infinity then the encoding is a single
137 * byte of 0x07. Since every other numeric value except NaN has a larger
138 * initial byte, this encoding ensures that negative infinity will sort prior
139 * to every other numeric value other than NaN.
140 * </p>
141 * <p>
142 * If the numeric value is a positive infinity then the encoding is a single
143 * byte of 0x23. Every other numeric value encoding begins with a smaller
144 * byte, ensuring that positive infinity always sorts last among numeric
145 * values. 0x23 is also smaller than 0x33, the initial byte of a text value,
146 * ensuring that every numeric value sorts before every text value.
147 * </p>
148 * <p>
149 * If the numeric value is exactly zero then it is encoded as a single byte of
150 * 0x15. Finite negative values will have initial bytes of 0x08 through 0x14
151 * and finite positive values will have initial bytes of 0x16 through 0x22.
152 * </p>
153 * <p>
154 * For all numeric values, we compute a mantissa M and an exponent E. The
155 * mantissa is a base-100 representation of the value. The exponent E
156 * determines where to put the decimal point.
157 * </p>
158 * <p>
159 * Each centimal digit of the mantissa is stored in a byte. If the value of
160 * the centimal digit is X (hence X≥0 and X≤99) then the byte value will
161 * be 2*X+1 for every byte of the mantissa, except for the last byte which
162 * will be 2*X+0. The mantissa must be the minimum number of bytes necessary
163 * to represent the value; trailing X==0 digits are omitted. This means that
164 * the mantissa will never contain a byte with the value 0x00.
165 * </p>
166 * <p>
167 * If we assume all digits of the mantissa occur to the right of the decimal
168 * point, then the exponent E is the power of one hundred by which one must
169 * multiply the mantissa to recover the original value.
170 * </p>
171 * <p>
172 * Values are classified as large, medium, or small according to the value of
173 * E. If E is 11 or more, the value is large. For E between 0 and 10, the
174 * value is medium. For E less than zero, the value is small.
175 * </p>
176 * <p>
177 * Large positive values are encoded as a single byte 0x22 followed by E as a
178 * varint and then M. Medium positive values are a single byte of 0x17+E
179 * followed by M. Small positive values are encoded as a single byte 0x16
180 * followed by the ones-complement of the varint for -E followed by M.
181 * </p>
182 * <p>
183 * Small negative values are encoded as a single byte 0x14 followed by -E as a
184 * varint and then the ones-complement of M. Medium negative values are
185 * encoded as a byte 0x13-E followed by the ones-complement of M. Large
186 * negative values consist of the single byte 0x08 followed by the
187 * ones-complement of the varint encoding of E followed by the ones-complement
188 * of M.
189 * </p>
190 * <h3>Fixed-length Integer Encoding</h3>
191 * <p>
192 * All 4-byte integers are serialized to a 5-byte, fixed-width, sortable byte
193 * format. All 8-byte integers are serialized to the equivalent 9-byte format.
194 * Serialization is performed by writing a header byte, inverting the integer
195 * sign bit and writing the resulting bytes to the byte array in big endian
196 * order.
197 * </p>
198 * <h3>Fixed-length Floating Point Encoding</h3>
199 * <p>
200 * 32-bit and 64-bit floating point numbers are encoded to a 5-byte and 9-byte
201 * encoding format, respectively. The format is identical, save for the
202 * precision respected in each step of the operation.
203 * <p>
204 * This format ensures the following total ordering of floating point values:
205 * Float.NEGATIVE_INFINITY &lt; -Float.MAX_VALUE &lt; ... &lt;
206 * -Float.MIN_VALUE &lt; -0.0 &lt; +0.0 &lt; Float.MIN_VALUE &lt; ... &lt;
207 * Float.MAX_VALUE &lt; Float.POSITIVE_INFINITY &lt; Float.NaN
208 * </p>
209 * <p>
210 * Floating point numbers are encoded as specified in IEEE 754. A 32-bit
211 * single precision float consists of a sign bit, 8-bit unsigned exponent
212 * encoded in offset-127 notation, and a 23-bit significand. The format is
213 * described further in the <a
214 * href="http://en.wikipedia.org/wiki/Single_precision"> Single Precision
215 * Floating Point Wikipedia page</a>
216 * </p>
217 * <p>
218 * The value of a normal float is -1 <sup>sign bit</sup> ×
219 * 2<sup>exponent - 127</sup> × 1.significand
220 * </p>
221 * <p>
222 * The IEEE 754 floating point format already preserves sort ordering for
223 * positive floating point numbers when the raw bytes are compared in most
224 * significant byte order. This is discussed further at <a href=
225 * "http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm">
226 * http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm</a>
227 * </p>
228 * <p>
229 * Thus, we need only ensure that negative numbers sort in the exact
230 * opposite order as positive numbers (so that say, negative infinity is less
231 * than negative 1), and that all negative numbers compare less than any
232 * positive number. To accomplish this, we invert the sign bit of all floating
233 * point numbers, and we also invert the exponent and significand bits if the
234 * floating point number was negative.
235 * </p>
236 * <p>
237 * More specifically, we first store the floating point bits into a 32-bit int
238 * {@code j} using {@link Float#floatToIntBits}. This method collapses
239 * all NaNs into a single, canonical NaN value but otherwise leaves the bits
240 * unchanged. We then compute
241 * </p>
242 *
243 * <pre>
244 * j ^= (j &gt;&gt; (Integer.SIZE - 1)) | Integer.MIN_VALUE
245 * </pre>
246 * <p>
247 * which inverts the sign bit and XOR's all other bits with the sign bit
248 * itself. Comparing the raw bytes of {@code j} in most significant byte
249 * order is equivalent to performing a single precision floating point
250 * comparison on the underlying bits (ignoring NaN comparisons, as NaNs don't
251 * compare equal to anything when performing floating point comparisons).
252 * </p>
253 * <p>
254 * The resulting integer is then converted into a byte array by serializing
255 * the integer one byte at a time in most significant byte order. The
256 * serialized integer is prefixed by a single header byte. All serialized
257 * values are 5 bytes in length.
258 * </p>
259 * <p>
260 * {@code OrderedBytes} encodings are heavily influenced by the <a href="
261 * http://sqlite.org/src4/doc/trunk/www/key_encoding.wiki">SQLite4 Key
262 * Encoding</a>. Slight deviations are made in the interest of order
263 * correctness and user extensibility. Fixed-width {@code Long} and
264 * {@link Double} encodings are based on implementations from the now defunct
265 * Orderly library.
266 * </p>
267 */
268 @InterfaceAudience.Public
269 @InterfaceStability.Evolving
270 public class OrderedBytes {
271
272 /*
273 * These constants define header bytes used to identify encoded values. Note
274 * that the values here are not exhaustive as the Numeric format encodes
275 * portions of its value within the header byte. The values listed here are
276 * directly applied to persisted data -- DO NOT modify the values specified
277 * here. Instead, gaps are placed intentionally between values so that new
278 * implementations can be inserted into the total ordering enforced here.
279 */
280 private static final byte NULL = 0x05; // NULL value
281 private static final byte NEG_INF = 0x07; // numeric: negative infinity
282 private static final byte NEG_LARGE = 0x08; // numeric: negative large, followed by ~E, ~M
283 private static final byte NEG_MED_MIN = 0x09; // numeric: lowest negative medium header (0x13 - E)
284 private static final byte NEG_MED_MAX = 0x13; // numeric: highest negative medium header (E == 0)
285 private static final byte NEG_SMALL = 0x14; // numeric: negative small, followed by -E, ~M
286 private static final byte ZERO = 0x15; // numeric: exactly zero
287 private static final byte POS_SMALL = 0x16; // numeric: positive small, followed by ~-E, M
288 private static final byte POS_MED_MIN = 0x17; // numeric: lowest positive medium header (E == 0)
289 private static final byte POS_MED_MAX = 0x21; // numeric: highest positive medium header (0x17 + E)
290 private static final byte POS_LARGE = 0x22; // numeric: positive large, followed by E, M
291 private static final byte POS_INF = 0x23; // numeric: positive infinity
292 private static final byte NAN = 0x25; // numeric: NaN
293 private static final byte FIXED_INT32 = 0x27; // fixed-length 32-bit integer
294 private static final byte FIXED_INT64 = 0x28; // fixed-length 64-bit integer
295 private static final byte FIXED_FLOAT32 = 0x30; // fixed-length 32-bit float
296 private static final byte FIXED_FLOAT64 = 0x31; // fixed-length 64-bit float
297 private static final byte TEXT = 0x33; // text value
298 private static final byte BLOB_VAR = 0x35; // variable length BLOB ("BlobVar")
299 private static final byte BLOB_COPY = 0x36; // byte-for-byte BLOB ("BlobCopy")
300
301 /*
302 * The following constant values are used by encoding implementations
303 */
304
305 public static final Charset UTF8 = Charset.forName("UTF-8"); // UTF-8 charset instance
306 private static final byte TERM = 0x00; // presumably the 0x00 terminator of the Text/BlobCopy encodings; usage not visible here
307 private static final BigDecimal E8 = BigDecimal.valueOf(1e8); // 10^8: scaling threshold in encodeNumericLarge
308 private static final BigDecimal E32 = BigDecimal.valueOf(1e32); // 10^32: scaling threshold in encodeNumericLarge
309 private static final BigDecimal EN2 = BigDecimal.valueOf(1e-2); // 10^-2: scaling threshold in encodeNumericSmall
310 private static final BigDecimal EN10 = BigDecimal.valueOf(1e-10); // 10^-10: scaling threshold in encodeNumericSmall
311
312 /**
313 * Precision, in significant decimal digits, of {@link #DEFAULT_MATH_CONTEXT}.
314 */
315 public static final int MAX_PRECISION = 31; // NOTE(review): formerly documented as "guaranteed to fit into a long", but a long holds at most 19 decimal digits -- confirm intent
316
317 /**
318 * The context used by {@link #normalize(BigDecimal)}: {@link #MAX_PRECISION} digits, rounding half-up.
319 */
320 public static final MathContext DEFAULT_MATH_CONTEXT =
321 new MathContext(MAX_PRECISION, RoundingMode.HALF_UP);
322
323 /**
324 * Creates the standard exception when the encoded header byte is unexpected for the decoding
325 * context.
326 * @param header value used in error message.
327 */
328 private static IllegalArgumentException unexpectedHeader(byte header) {
329 throw new IllegalArgumentException("unexpected value in first byte: 0x"
330 + Long.toHexString(header));
331 }
332
333 /**
334 * Perform unsigned comparison between two long values. Conforms to the same interface as
335 * {@link Comparator#compare(Object, Object)}.
336 */
337 private static int unsignedCmp(long x1, long x2) {
338 int cmp;
339 if ((cmp = (x1 < x2 ? -1 : (x1 == x2 ? 0 : 1))) == 0) return 0;
340 // invert the result when either value is negative
341 if ((x1 < 0) != (x2 < 0)) return -cmp;
342 return cmp;
343 }
344
345 /**
346 * Write a 32-bit unsigned integer to {@code dst} as 4 big-endian bytes.
347 * @return number of bytes written.
348 */
349 private static int putUint32(PositionedByteRange dst, int val) {
350 dst.put((byte) (val >>> 24))
351 .put((byte) (val >>> 16))
352 .put((byte) (val >>> 8))
353 .put((byte) val);
354 return 4;
355 }
356
357 /**
358 * Encode an unsigned 64-bit unsigned integer {@code val} into {@code dst}.
359 * @param dst The destination to which encoded bytes are written.
360 * @param val The value to write.
361 * @param comp Compliment the encoded value when {@code comp} is true.
362 * @return number of bytes written.
363 */
364 @VisibleForTesting
365 static int putVaruint64(PositionedByteRange dst, long val, boolean comp) {
366 int w, y, len = 0;
367 final int offset = dst.getOffset(), start = dst.getPosition();
368 byte[] a = dst.getBytes();
369 Order ord = comp ? DESCENDING : ASCENDING;
370 if (-1 == unsignedCmp(val, 241L)) {
371 dst.put((byte) val);
372 len = dst.getPosition() - start;
373 ord.apply(a, offset + start, len);
374 return len;
375 }
376 if (-1 == unsignedCmp(val, 2288L)) {
377 y = (int) (val - 240);
378 dst.put((byte) (y / 256 + 241))
379 .put((byte) (y % 256));
380 len = dst.getPosition() - start;
381 ord.apply(a, offset + start, len);
382 return len;
383 }
384 if (-1 == unsignedCmp(val, 67824L)) {
385 y = (int) (val - 2288);
386 dst.put((byte) 249)
387 .put((byte) (y / 256))
388 .put((byte) (y % 256));
389 len = dst.getPosition() - start;
390 ord.apply(a, offset + start, len);
391 return len;
392 }
393 y = (int) val;
394 w = (int) (val >>> 32);
395 if (w == 0) {
396 if (-1 == unsignedCmp(y, 16777216L)) {
397 dst.put((byte) 250)
398 .put((byte) (y >>> 16))
399 .put((byte) (y >>> 8))
400 .put((byte) y);
401 len = dst.getPosition() - start;
402 ord.apply(a, offset + start, len);
403 return len;
404 }
405 dst.put((byte) 251);
406 putUint32(dst, y);
407 len = dst.getPosition() - start;
408 ord.apply(a, offset + start, len);
409 return len;
410 }
411 if (-1 == unsignedCmp(w, 256L)) {
412 dst.put((byte) 252)
413 .put((byte) w);
414 putUint32(dst, y);
415 len = dst.getPosition() - start;
416 ord.apply(a, offset + start, len);
417 return len;
418 }
419 if (-1 == unsignedCmp(w, 65536L)) {
420 dst.put((byte) 253)
421 .put((byte) (w >>> 8))
422 .put((byte) w);
423 putUint32(dst, y);
424 len = dst.getPosition() - start;
425 ord.apply(a, offset + start, len);
426 return len;
427 }
428 if (-1 == unsignedCmp(w, 16777216L)) {
429 dst.put((byte) 254)
430 .put((byte) (w >>> 16))
431 .put((byte) (w >>> 8))
432 .put((byte) w);
433 putUint32(dst, y);
434 len = dst.getPosition() - start;
435 ord.apply(a, offset + start, len);
436 return len;
437 }
438 dst.put((byte) 255);
439 putUint32(dst, w);
440 putUint32(dst, y);
441 len = dst.getPosition() - start;
442 ord.apply(a, offset + start, len);
443 return len;
444 }
445
446 /**
447 * Inspect {@code src} for an encoded varuint64 for its length in bytes.
448 * Preserves the state of {@code src}.
449 * @param src source buffer
450 * @param comp if true, parse the compliment of the value.
451 * @return the number of bytes consumed by this value.
452 */
453 @VisibleForTesting
454 static int lengthVaruint64(PositionedByteRange src, boolean comp) {
455 int a0 = (comp ? DESCENDING : ASCENDING).apply(src.peek()) & 0xff;
456 if (a0 <= 240) return 1;
457 if (a0 >= 241 && a0 <= 248) return 2;
458 if (a0 == 249) return 3;
459 if (a0 == 250) return 4;
460 if (a0 == 251) return 5;
461 if (a0 == 252) return 6;
462 if (a0 == 253) return 7;
463 if (a0 == 254) return 8;
464 if (a0 == 255) return 9;
465 throw unexpectedHeader(src.peek());
466 }
467
468 /**
469 * Skip {@code src} over the encoded varuint64.
470 * @param src source buffer
471 * @param cmp if true, parse the compliment of the value.
472 * @return the number of bytes skipped.
473 */
474 @VisibleForTesting
475 static int skipVaruint64(PositionedByteRange src, boolean cmp) {
476 final int len = lengthVaruint64(src, cmp);
477 src.setPosition(src.getPosition() + len);
478 return len;
479 }
480
481 /**
482 * Decode a sequence of bytes in {@code src} as a varuint64. Compliment the
483 * encoded value when {@code comp} is true.
484 * @return the decoded value.
485 */
486 @VisibleForTesting
487 static long getVaruint64(PositionedByteRange src, boolean comp) {
488 assert src.getRemaining() >= lengthVaruint64(src, comp);
489 final long ret;
490 Order ord = comp ? DESCENDING : ASCENDING;
491 byte x = src.get();
492 final int a0 = ord.apply(x) & 0xff, a1, a2, a3, a4, a5, a6, a7, a8;
493 if (-1 == unsignedCmp(a0, 241)) {
494 return a0;
495 }
496 x = src.get();
497 a1 = ord.apply(x) & 0xff;
498 if (-1 == unsignedCmp(a0, 249)) {
499 return (a0 - 241) * 256 + a1 + 240;
500 }
501 x = src.get();
502 a2 = ord.apply(x) & 0xff;
503 if (a0 == 249) {
504 return 2288 + 256 * a1 + a2;
505 }
506 x = src.get();
507 a3 = ord.apply(x) & 0xff;
508 if (a0 == 250) {
509 return (a1 << 16) | (a2 << 8) | a3;
510 }
511 x = src.get();
512 a4 = ord.apply(x) & 0xff;
513 ret = (((long) a1) << 24) | (a2 << 16) | (a3 << 8) | a4;
514 if (a0 == 251) {
515 return ret;
516 }
517 x = src.get();
518 a5 = ord.apply(x) & 0xff;
519 if (a0 == 252) {
520 return (ret << 8) | a5;
521 }
522 x = src.get();
523 a6 = ord.apply(x) & 0xff;
524 if (a0 == 253) {
525 return (ret << 16) | (a5 << 8) | a6;
526 }
527 x = src.get();
528 a7 = ord.apply(x) & 0xff;
529 if (a0 == 254) {
530 return (ret << 24) | (a5 << 16) | (a6 << 8) | a7;
531 }
532 x = src.get();
533 a8 = ord.apply(x) & 0xff;
534 return (ret << 32) | (((long) a5) << 24) | (a6 << 16) | (a7 << 8) | a8;
535 }
536
537 /**
538 * Strip all trailing zeros to ensure that no digit will be zero and round
539 * using our default context to ensure precision doesn't exceed max allowed.
540 * From Phoenix's {@code NumberUtil}.
541 * @return new {@link BigDecimal} instance
542 */
543 @VisibleForTesting
544 static BigDecimal normalize(BigDecimal val) {
545 return null == val ? null : val.stripTrailingZeros().round(DEFAULT_MATH_CONTEXT);
546 }
547
548 /**
549 * Read significand digits from {@code src} according to the magnitude
550 * of {@code e}.
551 * @param src The source from which to read encoded digits.
552 * @param e The magnitude of the first digit read.
553 * @param comp Treat encoded bytes as compliments when {@code comp} is true.
554 * @return The decoded value.
555 * @throws IllegalArgumentException when read exceeds the remaining length
556 * of {@code src}.
557 */
558 private static BigDecimal decodeSignificand(PositionedByteRange src, int e, boolean comp) {
559 // TODO: can this be made faster?
560 byte[] a = src.getBytes();
561 final int start = src.getPosition(), offset = src.getOffset(), remaining = src.getRemaining();
562 Order ord = comp ? DESCENDING : ASCENDING;
563 BigDecimal m = BigDecimal.ZERO;
564 e--;
565 for (int i = 0;; i++) {
566 if (i > remaining) {
567 // we've exceeded this range's window
568 src.setPosition(start);
569 throw new IllegalArgumentException(
570 "Read exceeds range before termination byte found. offset: " + offset + " position: "
571 + (start + i));
572 }
573 // base-100 digits are encoded as val * 2 + 1 except for the termination digit.
574 m = m.add( // m +=
575 new BigDecimal(BigInteger.ONE, e * -2).multiply( // 100 ^ p * [decoded digit]
576 BigDecimal.valueOf((ord.apply(a[offset + start + i]) & 0xff) / 2)));
577 e--;
578 // detect termination digit
579 if ((ord.apply(a[offset + start + i]) & 1) == 0) {
580 src.setPosition(start + i + 1);
581 break;
582 }
583 }
584 return normalize(m);
585 }
586
587 /**
588 * Skip {@code src} over the significand bytes.
589 * @param src The source from which to read encoded digits.
590 * @param comp Treat encoded bytes as compliments when {@code comp} is true.
591 * @return the number of bytes skipped.
592 */
593 private static int skipSignificand(PositionedByteRange src, boolean comp) {
594 byte[] a = src.getBytes();
595 final int offset = src.getOffset(), start = src.getPosition();
596 int i = src.getPosition();
597 while (((comp ? DESCENDING : ASCENDING).apply(a[offset + i++]) & 1) != 0)
598 ;
599 src.setPosition(i);
600 return i - start;
601 }
602
603 /**
604 * <p>
605 * Encode the small magnitude floating point number {@code val} using the
606 * key encoding. The caller guarantees that 1.0 > abs(val) > 0.0.
607 * </p>
608 * <p>
609 * A floating point value is encoded as an integer exponent {@code E} and a
610 * mantissa {@code M}. The original value is equal to {@code (M * 100^E)}.
611 * {@code E} is set to the smallest value possible without making {@code M}
612 * greater than or equal to 1.0.
613 * </p>
614 * <p>
615 * For this routine, {@code E} will always be zero or negative, since the
616 * original value is less than one. The encoding written by this routine is
617 * the ones-complement of the varint of the negative of {@code E} followed
618 * by the mantissa:
619 * <pre>
620 * Encoding: ~-E M
621 * </pre>
622 * </p>
623 * @param dst The destination to which encoded digits are written.
624 * @param val The value to encode.
625 * @return the number of bytes written.
626 */
627 private static int encodeNumericSmall(PositionedByteRange dst, BigDecimal val) {
628 // TODO: this can be done faster?
629 // assert 1.0 > abs(val) > 0.0
630 BigDecimal abs = val.abs();
631 assert BigDecimal.ZERO.compareTo(abs) < 0 && BigDecimal.ONE.compareTo(abs) > 0;
632 byte[] a = dst.getBytes();
633 boolean isNeg = val.signum() == -1;
634 final int offset = dst.getOffset(), start = dst.getPosition();
635 int e = 0, d, startM;
636
637 if (isNeg) { /* Small negative number: 0x14, -E, ~M */
638 dst.put(NEG_SMALL);
639 } else { /* Small positive number: 0x16, ~-E, M */
640 dst.put(POS_SMALL);
641 }
642
643 // normalize abs(val) to determine E
644 while (abs.compareTo(EN10) < 0) { abs = abs.movePointRight(8); e += 4; }
645 while (abs.compareTo(EN2) < 0) { abs = abs.movePointRight(2); e++; }
646
647 putVaruint64(dst, e, !isNeg); // encode appropriate E value.
648
649 // encode M by peeling off centimal digits, encoding x as 2x+1
650 startM = dst.getPosition();
651 // TODO: 18 is an arbitrary encoding limit. Reevaluate once we have a better handling of
652 // numeric scale.
653 for (int i = 0; i < 18 && abs.compareTo(BigDecimal.ZERO) != 0; i++) {
654 abs = abs.movePointRight(2);
655 d = abs.intValue();
656 dst.put((byte) ((2 * d + 1) & 0xff));
657 abs = abs.subtract(BigDecimal.valueOf(d));
658 }
659 a[offset + dst.getPosition() - 1] &= 0xfe; // terminal digit should be 2x
660 if (isNeg) {
661 // negative values encoded as ~M
662 DESCENDING.apply(a, offset + startM, dst.getPosition() - startM);
663 }
664 return dst.getPosition() - start;
665 }
666
667 /**
668 * Encode the large magnitude floating point number {@code val} using
669 * the key encoding. The caller guarantees that {@code val} will be
670 * finite and abs(val) >= 1.0.
671 * <p>
672 * A floating point value is encoded as an integer exponent {@code E}
673 * and a mantissa {@code M}. The original value is equal to
674 * {@code (M * 100^E)}. {@code E} is set to the smallest value
675 * possible without making {@code M} greater than or equal to 1.0.
676 * </p>
677 * <p>
678 * Each centimal digit of the mantissa is stored in a byte. If the value of
679 * the centimal digit is {@code X} (hence {@code X>=0} and
680 * {@code X<=99}) then the byte value will be {@code 2*X+1} for
681 * every byte of the mantissa, except for the last byte which will be
682 * {@code 2*X+0}. The mantissa must be the minimum number of bytes
683 * necessary to represent the value; trailing {@code X==0} digits are
684 * omitted. This means that the mantissa will never contain a byte with the
685 * value {@code 0x00}.
686 * </p>
687 * <p>
688 * If {@code E > 10}, then this routine writes of {@code E} as a
689 * varint followed by the mantissa as described above. Otherwise, if
690 * {@code E <= 10}, this routine only writes the mantissa and leaves
691 * the {@code E} value to be encoded as part of the opening byte of the
692 * field by the calling function.
693 *
694 * <pre>
695 * Encoding: M (if E<=10)
696 * E M (if E>10)
697 * </pre>
698 * </p>
699 * @param dst The destination to which encoded digits are written.
700 * @param val The value to encode.
701 * @return the number of bytes written.
702 */
703 private static int encodeNumericLarge(PositionedByteRange dst, BigDecimal val) {
704 // TODO: this can be done faster
705 BigDecimal abs = val.abs();
706 byte[] a = dst.getBytes();
707 boolean isNeg = val.signum() == -1;
708 final int start = dst.getPosition(), offset = dst.getOffset();
709 int e = 0, d, startM;
710
711 if (isNeg) { /* Large negative number: 0x08, ~E, ~M */
712 dst.put(NEG_LARGE);
713 } else { /* Large positive number: 0x22, E, M */
714 dst.put(POS_LARGE);
715 }
716
717 // normalize abs(val) to determine E
718 while (abs.compareTo(E32) >= 0 && e <= 350) { abs = abs.movePointLeft(32); e +=16; }
719 while (abs.compareTo(E8) >= 0 && e <= 350) { abs = abs.movePointLeft(8); e+= 4; }
720 while (abs.compareTo(BigDecimal.ONE) >= 0 && e <= 350) { abs = abs.movePointLeft(2); e++; }
721
722 // encode appropriate header byte and/or E value.
723 if (e > 10) { /* large number, write out {~,}E */
724 putVaruint64(dst, e, isNeg);
725 } else {
726 if (isNeg) { /* Medium negative number: 0x13-E, ~M */
727 dst.put(start, (byte) (NEG_MED_MAX - e));
728 } else { /* Medium positive number: 0x17+E, M */
729 dst.put(start, (byte) (POS_MED_MIN + e));
730 }
731 }
732
733 // encode M by peeling off centimal digits, encoding x as 2x+1
734 startM = dst.getPosition();
735 // TODO: 18 is an arbitrary encoding limit. Reevaluate once we have a better handling of
736 // numeric scale.
737 for (int i = 0; i < 18 && abs.compareTo(BigDecimal.ZERO) != 0; i++) {
738 abs = abs.movePointRight(2);
739 d = abs.intValue();
740 dst.put((byte) (2 * d + 1));
741 abs = abs.subtract(BigDecimal.valueOf(d));
742 }
743
744 a[offset + dst.getPosition() - 1] &= 0xfe; // terminal digit should be 2x
745 if (isNeg) {
746 // negative values encoded as ~M
747 DESCENDING.apply(a, offset + startM, dst.getPosition() - startM);
748 }
749 return dst.getPosition() - start;
750 }
751
752 /**
753 * Encode a numerical value using the variable-length encoding.
754 * @param dst The destination to which encoded digits are written.
755 * @param val The value to encode.
756 * @param ord The {@link Order} to respect while encoding {@code val}.
757 * @return the number of bytes written.
758 */
759 public static int encodeNumeric(PositionedByteRange dst, long val, Order ord) {
760 return encodeNumeric(dst, BigDecimal.valueOf(val), ord);
761 }
762
763 /**
764 * Encode a numerical value using the variable-length encoding.
765 * @param dst The destination to which encoded digits are written.
766 * @param val The value to encode.
767 * @param ord The {@link Order} to respect while encoding {@code val}.
768 * @return the number of bytes written.
769 */
770 public static int encodeNumeric(PositionedByteRange dst, double val, Order ord) {
771 if (val == 0.0) {
772 dst.put(ord.apply(ZERO));
773 return 1;
774 }
775 if (Double.isNaN(val)) {
776 dst.put(ord.apply(NAN));
777 return 1;
778 }
779 if (val == Double.NEGATIVE_INFINITY) {
780 dst.put(ord.apply(NEG_INF));
781 return 1;
782 }
783 if (val == Double.POSITIVE_INFINITY) {
784 dst.put(ord.apply(POS_INF));
785 return 1;
786 }
787 return encodeNumeric(dst, BigDecimal.valueOf(val), ord);
788 }
789
790 /**
791 * Encode a numerical value using the variable-length encoding.
792 * @param dst The destination to which encoded digits are written.
793 * @param val The value to encode.
794 * @param ord The {@link Order} to respect while encoding {@code val}.
795 * @return the number of bytes written.
796 */
797 public static int encodeNumeric(PositionedByteRange dst, BigDecimal val, Order ord) {
798 final int len, offset = dst.getOffset(), start = dst.getPosition();
799 if (null == val) {
800 return encodeNull(dst, ord);
801 } else if (BigDecimal.ZERO.compareTo(val) == 0) {
802 dst.put(ord.apply(ZERO));
803 return 1;
804 }
805 BigDecimal abs = val.abs();
806 if (BigDecimal.ONE.compareTo(abs) <= 0) { // abs(v) >= 1.0
807 len = encodeNumericLarge(dst, normalize(val));
808 } else { // 1.0 > abs(v) >= 0.0
809 len = encodeNumericSmall(dst, normalize(val));
810 }
811 ord.apply(dst.getBytes(), offset + start, len);
812 return len;
813 }
814
815 /**
816 * Decode a {@link BigDecimal} from {@code src}. Assumes {@code src} encodes
817 * a value in Numeric encoding and is within the valid range of
818 * {@link BigDecimal} values. {@link BigDecimal} does not support {@code NaN}
819 * or {@code Infinte} values.
820 * @see #decodeNumericAsDouble(byte[], int)
821 */
822 private static BigDecimal decodeNumericValue(PositionedByteRange src) {
823 final int e;
824 byte header = src.get();
825 boolean dsc = -1 == Integer.signum(header);
826 header = dsc ? DESCENDING.apply(header) : header;
827
828 if (header == NULL) return null;
829 if (header == NEG_LARGE) { /* Large negative number: 0x08, ~E, ~M */
830 e = (int) getVaruint64(src, !dsc);
831 return decodeSignificand(src, e, !dsc).negate();
832 }
833 if (header >= NEG_MED_MIN && header <= NEG_MED_MAX) {
834 /* Medium negative number: 0x13-E, ~M */
835 e = NEG_MED_MAX - header;
836 return decodeSignificand(src, e, !dsc).negate();
837 }
838 if (header == NEG_SMALL) { /* Small negative number: 0x14, -E, ~M */
839 e = (int) -getVaruint64(src, dsc);
840 return decodeSignificand(src, e, !dsc).negate();
841 }
842 if (header == ZERO) {
843 return BigDecimal.ZERO;
844 }
845 if (header == POS_SMALL) { /* Small positive number: 0x16, ~-E, M */
846 e = (int) -getVaruint64(src, !dsc);
847 return decodeSignificand(src, e, dsc);
848 }
849 if (header >= POS_MED_MIN && header <= POS_MED_MAX) {
850 /* Medium positive number: 0x17+E, M */
851 e = header - POS_MED_MIN;
852 return decodeSignificand(src, e, dsc);
853 }
854 if (header == POS_LARGE) { /* Large positive number: 0x22, E, M */
855 e = (int) getVaruint64(src, dsc);
856 return decodeSignificand(src, e, dsc);
857 }
858 throw unexpectedHeader(header);
859 }
860
861 /**
862 * Decode a primitive {@code double} value from the Numeric encoding. Numeric
863 * encoding is based on {@link BigDecimal}; in the event the encoded value is
864 * larger than can be represented in a {@code double}, this method performs
865 * an implicit narrowing conversion as described in
866 * {@link BigDecimal#doubleValue()}.
867 * @throws NullPointerException when the encoded value is {@code NULL}.
868 * @throws IllegalArgumentException when the encoded value is not a Numeric.
869 * @see #encodeNumeric(PositionedByteRange, double, Order)
870 * @see BigDecimal#doubleValue()
871 */
872 public static double decodeNumericAsDouble(PositionedByteRange src) {
873 // TODO: should an encoded NULL value throw unexpectedHeader() instead?
874 if (isNull(src)) {
875 throw new NullPointerException("A null value cannot be decoded to a double.");
876 }
877 if (isNumericNaN(src)) {
878 src.get();
879 return Double.NaN;
880 }
881 if (isNumericZero(src)) {
882 src.get();
883 return Double.valueOf(0.0);
884 }
885
886 byte header = -1 == Integer.signum(src.peek()) ? DESCENDING.apply(src.peek()) : src.peek();
887
888 if (header == NEG_INF) {
889 src.get();
890 return Double.NEGATIVE_INFINITY;
891 } else if (header == POS_INF) {
892 src.get();
893 return Double.POSITIVE_INFINITY;
894 } else {
895 return decodeNumericValue(src).doubleValue();
896 }
897 }
898
899 /**
900 * Decode a primitive {@code long} value from the Numeric encoding. Numeric
901 * encoding is based on {@link BigDecimal}; in the event the encoded value is
902 * larger than can be represented in a {@code long}, this method performs an
903 * implicit narrowing conversion as described in
904 * {@link BigDecimal#doubleValue()}.
905 * @throws NullPointerException when the encoded value is {@code NULL}.
906 * @throws IllegalArgumentException when the encoded value is not a Numeric.
907 * @see #encodeNumeric(PositionedByteRange, long, Order)
908 * @see BigDecimal#longValue()
909 */
910 public static long decodeNumericAsLong(PositionedByteRange src) {
911 // TODO: should an encoded NULL value throw unexpectedHeader() instead?
912 if (isNull(src)) throw new NullPointerException();
913 if (!isNumeric(src)) throw unexpectedHeader(src.peek());
914 if (isNumericNaN(src)) throw unexpectedHeader(src.peek());
915 if (isNumericInfinite(src)) throw unexpectedHeader(src.peek());
916
917 if (isNumericZero(src)) {
918 src.get();
919 return Long.valueOf(0);
920 }
921 return decodeNumericValue(src).longValue();
922 }
923
924 /**
925 * Decode a {@link BigDecimal} value from the variable-length encoding.
926 * @throws IllegalArgumentException when the encoded value is not a Numeric.
927 * @see #encodeNumeric(PositionedByteRange, BigDecimal, Order)
928 */
929 public static BigDecimal decodeNumericAsBigDecimal(PositionedByteRange src) {
930 if (isNull(src)) {
931 src.get();
932 return null;
933 }
934 if (!isNumeric(src)) throw unexpectedHeader(src.peek());
935 if (isNumericNaN(src)) throw unexpectedHeader(src.peek());
936 if (isNumericInfinite(src)) throw unexpectedHeader(src.peek());
937 return decodeNumericValue(src);
938 }
939
940 /**
941 * Encode a String value. String encoding is 0x00-terminated and so it does
942 * not support {@code \u0000} codepoints in the value.
943 * @param dst The destination to which the encoded value is written.
944 * @param val The value to encode.
945 * @param ord The {@link Order} to respect while encoding {@code val}.
946 * @return the number of bytes written.
947 * @throws IllegalArgumentException when {@code val} contains a {@code \u0000}.
948 */
949 public static int encodeString(PositionedByteRange dst, String val, Order ord) {
950 if (null == val) {
951 return encodeNull(dst, ord);
952 }
953 if (val.contains("\u0000"))
954 throw new IllegalArgumentException("Cannot encode String values containing '\\u0000'");
955 final int offset = dst.getOffset(), start = dst.getPosition();
956 dst.put(TEXT);
957 // TODO: is there no way to decode into dst directly?
958 dst.put(val.getBytes(UTF8));
959 dst.put(TERM);
960 ord.apply(dst.getBytes(), offset + start, dst.getPosition() - start);
961 return dst.getPosition() - start;
962 }
963
964 /**
965 * Decode a String value.
966 */
967 public static String decodeString(PositionedByteRange src) {
968 final byte header = src.get();
969 if (header == NULL || header == DESCENDING.apply(NULL))
970 return null;
971 assert header == TEXT || header == DESCENDING.apply(TEXT);
972 Order ord = header == TEXT ? ASCENDING : DESCENDING;
973 byte[] a = src.getBytes();
974 final int offset = src.getOffset(), start = src.getPosition();
975 final byte terminator = ord.apply(TERM);
976 int i = offset + start;
977 for (; a[i] != terminator; i++)
978 ;
979 src.setPosition(i - offset + 1); // advance position to TERM + 1
980 if (DESCENDING == ord) {
981 // make a copy so that we don't disturb encoded value with ord.
982 byte[] copy = new byte[i - offset - 1];
983 System.arraycopy(a, offset + start, copy, 0, copy.length);
984 ord.apply(copy);
985 return new String(copy, UTF8);
986 } else {
987 return new String(a, offset + start, i - offset - 1, UTF8);
988 }
989 }
990
991 /**
992 * Calculate the expected BlobVar encoded length based on unencoded length.
993 */
994 public static int blobVarEncodedLength(int len) {
995 if (0 == len)
996 return 2; // 1-byte header + 1-byte terminator
997 else
998 return (int)
999 Math.ceil(
1000 (len * 8) // 8-bits per input byte
1001 / 7.0) // 7-bits of input data per encoded byte, rounded up
1002 + 1; // + 1-byte header
1003 }
1004
1005 /**
1006 * Calculate the expected BlobVar decoded length based on encoded length.
1007 */
1008 @VisibleForTesting
1009 static int blobVarDecodedLength(int len) {
1010 return
1011 ((len
1012 - 1) // 1-byte header
1013 * 7) // 7-bits of payload per encoded byte
1014 / 8; // 8-bits per byte
1015 }
1016
1017 /**
1018 * Encode a Blob value using a modified varint encoding scheme.
1019 * <p>
1020 * This format encodes a byte[] value such that no limitations on the input
1021 * value are imposed. The first byte encodes the encoding scheme that
1022 * follows, {@link #BLOB_VAR}. Each encoded byte thereafter consists of a
1023 * header bit followed by 7 bits of payload. A header bit of '1' indicates
1024 * continuation of the encoding. A header bit of '0' indicates this byte
1025 * contains the last of the payload. An empty input value is encoded as the
1026 * header byte immediately followed by a termination byte {@code 0x00}. This
1027 * is not ambiguous with the encoded value of {@code []}, which results in
1028 * {@code [0x80, 0x00]}.
1029 * </p>
1030 * @return the number of bytes written.
1031 */
1032 public static int encodeBlobVar(PositionedByteRange dst, byte[] val, int voff, int vlen,
1033 Order ord) {
1034 if (null == val) {
1035 return encodeNull(dst, ord);
1036 }
1037 // Empty value is null-terminated. All other values are encoded as 7-bits per byte.
1038 assert dst.getRemaining() >= blobVarEncodedLength(vlen) : "buffer overflow expected.";
1039 final int offset = dst.getOffset(), start = dst.getPosition();
1040 dst.put(BLOB_VAR);
1041 if (0 == vlen) {
1042 dst.put(TERM);
1043 } else {
1044 byte s = 1, t = 0;
1045 for (int i = voff; i < vlen; i++) {
1046 dst.put((byte) (0x80 | t | ((val[i] & 0xff) >>> s)));
1047 if (s < 7) {
1048 t = (byte) (val[i] << (7 - s));
1049 s++;
1050 } else {
1051 dst.put((byte) (0x80 | val[i]));
1052 s = 1;
1053 t = 0;
1054 }
1055 }
1056 if (s > 1) {
1057 dst.put((byte) (0x7f & t));
1058 } else {
1059 dst.getBytes()[offset + dst.getPosition() - 1] &= 0x7f;
1060 }
1061 }
1062 ord.apply(dst.getBytes(), offset + start, dst.getPosition() - start);
1063 return dst.getPosition() - start;
1064 }
1065
1066 /**
1067 * Encode a blob value using a modified varint encoding scheme.
1068 * @return the number of bytes written.
1069 * @see #encodeBlobVar(PositionedByteRange, byte[], int, int, Order)
1070 */
1071 public static int encodeBlobVar(PositionedByteRange dst, byte[] val, Order ord) {
1072 return encodeBlobVar(dst, val, 0, null != val ? val.length : 0, ord);
1073 }
1074
/**
 * Decode a blob value that was encoded using BlobVar encoding.
 * <p>
 * Reads the header byte, locates the final encoded byte (the first one whose
 * high bit is clear once the sort order is undone), and then repacks the
 * 7-bit payloads of the encoded bytes into 8-bit output bytes. On return
 * {@code src} is positioned one past the final encoded byte.
 * </p>
 * @param src The buffer positioned at the encoded value.
 * @return the decoded bytes, an empty array for an encoded empty value, or
 *         {@code null} for an encoded NULL.
 */
public static byte[] decodeBlobVar(PositionedByteRange src) {
  final byte header = src.get();
  if (header == NULL || header == DESCENDING.apply(NULL)) {
    return null;
  }
  assert header == BLOB_VAR || header == DESCENDING.apply(BLOB_VAR);
  Order ord = BLOB_VAR == header ? ASCENDING : DESCENDING;
  if (src.peek() == ord.apply(TERM)) {
    // skip empty input buffer.
    src.get();
    return new byte[0];
  }
  final int offset = src.getOffset(), start = src.getPosition();
  int end;
  byte[] a = src.getBytes();
  // scan for the last encoded byte: the first one with a clear high (continuation) bit
  for (end = start; (byte) (ord.apply(a[offset + end]) & 0x80) != TERM; end++)
    ;
  end++; // increment end to 1-past last byte
  // create ret buffer using length of encoded data + 1 (header byte)
  PositionedByteRange ret = new SimplePositionedByteRange(blobVarDecodedLength(end - start + 1));
  // s: right-shift applied to the next encoded byte's 7 payload bits so that
  // only the bits still missing from the current output byte remain.
  // t: the high bits of the current output byte, carried over from the
  // previous encoded byte.
  int s = 6;
  byte t = (byte) ((ord.apply(a[offset + start]) << 1) & 0xff);
  for (int i = start + 1; i < end; i++) {
    if (s == 7) {
      // the carry holds a single bit: this byte's whole 7-bit payload completes
      // the output byte, so step to the following encoded byte to seed the next carry
      ret.put((byte) (t | (ord.apply(a[offset + i]) & 0x7f)));
      i++;
    } else {
      ret.put((byte) (t | ((ord.apply(a[offset + i]) & 0x7f) >>> s)));
    }
    // the extra i++ above may have stepped past the last encoded byte
    if (i == end) break;
    // keep the payload bits not yet emitted as the high bits of the next output
    // byte; note that "<< 8 - s" parses as "<< (8 - s)"
    t = (byte) ((ord.apply(a[offset + i]) << 8 - s) & 0xff);
    s = s == 1 ? 7 : s - 1;
  }
  src.setPosition(end);
  assert t == 0 : "Unexpected bits remaining after decoding blob.";
  assert ret.getPosition() == ret.getLength() : "Allocated unnecessarily large return buffer.";
  return ret.getBytes();
}
1116
1117 /**
1118 * Encode a Blob value as a byte-for-byte copy. BlobCopy encoding in
1119 * DESCENDING order is NULL terminated so as to preserve proper sorting of
1120 * {@code []} and so it does not support {@code 0x00} in the value.
1121 * @return the number of bytes written.
1122 * @throws IllegalArgumentException when {@code ord} is DESCENDING and
1123 * {@code val} contains a {@code 0x00} byte.
1124 */
1125 public static int encodeBlobCopy(PositionedByteRange dst, byte[] val, int voff, int vlen,
1126 Order ord) {
1127 if (null == val) {
1128 encodeNull(dst, ord);
1129 if (ASCENDING == ord) return 1;
1130 else {
1131 // DESCENDING ordered BlobCopy requires a termination bit to preserve
1132 // sort-order semantics of null values.
1133 dst.put(ord.apply(TERM));
1134 return 2;
1135 }
1136 }
1137 // Blobs as final entry in a compound key are written unencoded.
1138 assert dst.getRemaining() >= vlen + (ASCENDING == ord ? 1 : 2);
1139 if (DESCENDING == ord) {
1140 for (int i = 0; i < vlen; i++) {
1141 if (TERM == val[voff + i]) {
1142 throw new IllegalArgumentException("0x00 bytes not permitted in value.");
1143 }
1144 }
1145 }
1146 final int offset = dst.getOffset(), start = dst.getPosition();
1147 dst.put(BLOB_COPY);
1148 dst.put(val, voff, vlen);
1149 // DESCENDING ordered BlobCopy requires a termination bit to preserve
1150 // sort-order semantics of null values.
1151 if (DESCENDING == ord) dst.put(TERM);
1152 ord.apply(dst.getBytes(), offset + start, dst.getPosition() - start);
1153 return dst.getPosition() - start;
1154 }
1155
1156 /**
1157 * Encode a Blob value as a byte-for-byte copy. BlobCopy encoding in
1158 * DESCENDING order is NULL terminated so as to preserve proper sorting of
1159 * {@code []} and so it does not support {@code 0x00} in the value.
1160 * @return the number of bytes written.
1161 * @throws IllegalArgumentException when {@code ord} is DESCENDING and
1162 * {@code val} contains a {@code 0x00} byte.
1163 * @see #encodeBlobCopy(PositionedByteRange, byte[], int, int, Order)
1164 */
1165 public static int encodeBlobCopy(PositionedByteRange dst, byte[] val, Order ord) {
1166 return encodeBlobCopy(dst, val, 0, null != val ? val.length : 0, ord);
1167 }
1168
1169 /**
1170 * Decode a Blob value, byte-for-byte copy.
1171 * @see #encodeBlobCopy(PositionedByteRange, byte[], int, int, Order)
1172 */
1173 public static byte[] decodeBlobCopy(PositionedByteRange src) {
1174 byte header = src.get();
1175 if (header == NULL || header == DESCENDING.apply(NULL)) {
1176 return null;
1177 }
1178 assert header == BLOB_COPY || header == DESCENDING.apply(BLOB_COPY);
1179 Order ord = header == BLOB_COPY ? ASCENDING : DESCENDING;
1180 final int length = src.getRemaining() - (ASCENDING == ord ? 0 : 1);
1181 byte[] ret = new byte[length];
1182 src.get(ret);
1183 ord.apply(ret, 0, ret.length);
1184 // DESCENDING ordered BlobCopy requires a termination bit to preserve
1185 // sort-order semantics of null values.
1186 if (DESCENDING == ord) src.get();
1187 return ret;
1188 }
1189
1190 /**
1191 * Encode a null value.
1192 * @param dst The destination to which encoded digits are written.
1193 * @param ord The {@link Order} to respect while encoding {@code val}.
1194 * @return the number of bytes written.
1195 */
1196 public static int encodeNull(PositionedByteRange dst, Order ord) {
1197 dst.put(ord.apply(NULL));
1198 return 1;
1199 }
1200
1201 /**
1202 * Encode an {@code int32} value using the fixed-length encoding.
1203 * @return the number of bytes written.
1204 * @see #encodeInt64(PositionedByteRange, long, Order)
1205 * @see #decodeInt32(PositionedByteRange)
1206 */
1207 public static int encodeInt32(PositionedByteRange dst, int val, Order ord) {
1208 final int offset = dst.getOffset(), start = dst.getPosition();
1209 dst.put(FIXED_INT32)
1210 .put((byte) ((val >> 24) ^ 0x80))
1211 .put((byte) (val >> 16))
1212 .put((byte) (val >> 8))
1213 .put((byte) val);
1214 ord.apply(dst.getBytes(), offset + start, 5);
1215 return 5;
1216 }
1217
1218 /**
1219 * Decode an {@code int32} value.
1220 * @see #encodeInt32(PositionedByteRange, int, Order)
1221 */
1222 public static int decodeInt32(PositionedByteRange src) {
1223 final byte header = src.get();
1224 assert header == FIXED_INT32 || header == DESCENDING.apply(FIXED_INT32);
1225 Order ord = header == FIXED_INT32 ? ASCENDING : DESCENDING;
1226 int val = (ord.apply(src.get()) ^ 0x80) & 0xff;
1227 for (int i = 1; i < 4; i++) {
1228 val = (val << 8) + (ord.apply(src.get()) & 0xff);
1229 }
1230 return val;
1231 }
1232
1233 /**
1234 * Encode an {@code int64} value using the fixed-length encoding.
1235 * <p>
1236 * This format ensures that all longs sort in their natural order, as they
1237 * would sort when using signed long comparison.
1238 * </p>
1239 * <p>
1240 * All Longs are serialized to an 8-byte, fixed-width sortable byte format.
1241 * Serialization is performed by inverting the integer sign bit and writing
1242 * the resulting bytes to the byte array in big endian order. The encoded
1243 * value is prefixed by the {@link #FIXED_INT64} header byte. This encoding
1244 * is designed to handle java language primitives and so Null values are NOT
1245 * supported by this implementation.
1246 * </p>
1247 * <p>
1248 * For example:
1249 * </p>
1250 * <pre>
1251 * Input: 0x0000000000000005 (5)
1252 * Result: 0x288000000000000005
1253 *
1254 * Input: 0xfffffffffffffffb (-4)
1255 * Result: 0x280000000000000004
1256 *
1257 * Input: 0x7fffffffffffffff (Long.MAX_VALUE)
1258 * Result: 0x28ffffffffffffffff
1259 *
1260 * Input: 0x8000000000000000 (Long.MIN_VALUE)
1261 * Result: 0x287fffffffffffffff
1262 * </pre>
1263 * <p>
1264 * This encoding format, and much of this documentation string, is based on
1265 * Orderly's {@code FixedIntWritableRowKey}.
1266 * </p>
1267 * @return the number of bytes written.
1268 * @see #decodeInt64(PositionedByteRange)
1269 */
1270 public static int encodeInt64(PositionedByteRange dst, long val, Order ord) {
1271 final int offset = dst.getOffset(), start = dst.getPosition();
1272 dst.put(FIXED_INT64)
1273 .put((byte) ((val >> 56) ^ 0x80))
1274 .put((byte) (val >> 48))
1275 .put((byte) (val >> 40))
1276 .put((byte) (val >> 32))
1277 .put((byte) (val >> 24))
1278 .put((byte) (val >> 16))
1279 .put((byte) (val >> 8))
1280 .put((byte) val);
1281 ord.apply(dst.getBytes(), offset + start, 9);
1282 return 9;
1283 }
1284
1285 /**
1286 * Decode an {@code int64} value.
1287 * @see #encodeInt64(PositionedByteRange, long, Order)
1288 */
1289 public static long decodeInt64(PositionedByteRange src) {
1290 final byte header = src.get();
1291 assert header == FIXED_INT64 || header == DESCENDING.apply(FIXED_INT64);
1292 Order ord = header == FIXED_INT64 ? ASCENDING : DESCENDING;
1293 long val = (ord.apply(src.get()) ^ 0x80) & 0xff;
1294 for (int i = 1; i < 8; i++) {
1295 val = (val << 8) + (ord.apply(src.get()) & 0xff);
1296 }
1297 return val;
1298 }
1299
1300 /**
1301 * Encode a 32-bit floating point value using the fixed-length encoding.
1302 * Encoding format is described at length in
1303 * {@link #encodeFloat64(PositionedByteRange, double, Order)}.
1304 * @return the number of bytes written.
1305 * @see #decodeFloat32(PositionedByteRange)
1306 * @see #encodeFloat64(PositionedByteRange, double, Order)
1307 */
1308 public static int encodeFloat32(PositionedByteRange dst, float val, Order ord) {
1309 final int offset = dst.getOffset(), start = dst.getPosition();
1310 int i = Float.floatToIntBits(val);
1311 i ^= ((i >> Integer.SIZE - 1) | Integer.MIN_VALUE);
1312 dst.put(FIXED_FLOAT32)
1313 .put((byte) (i >> 24))
1314 .put((byte) (i >> 16))
1315 .put((byte) (i >> 8))
1316 .put((byte) i);
1317 ord.apply(dst.getBytes(), offset + start, 5);
1318 return 5;
1319 }
1320
1321 /**
1322 * Decode a 32-bit floating point value using the fixed-length encoding.
1323 * @see #encodeFloat32(PositionedByteRange, float, Order)
1324 */
1325 public static float decodeFloat32(PositionedByteRange src) {
1326 final byte header = src.get();
1327 assert header == FIXED_FLOAT32 || header == DESCENDING.apply(FIXED_FLOAT32);
1328 Order ord = header == FIXED_FLOAT32 ? ASCENDING : DESCENDING;
1329 int val = ord.apply(src.get()) & 0xff;
1330 for (int i = 1; i < 4; i++) {
1331 val = (val << 8) + (ord.apply(src.get()) & 0xff);
1332 }
1333 val ^= (~val >> Integer.SIZE - 1) | Integer.MIN_VALUE;
1334 return Float.intBitsToFloat(val);
1335 }
1336
1337 /**
1338 * Encode a 64-bit floating point value using the fixed-length encoding.
1339 * <p>
1340 * This format ensures the following total ordering of floating point
1341 * values: Double.NEGATIVE_INFINITY < -Double.MAX_VALUE < ... <
1342 * -Double.MIN_VALUE < -0.0 < +0.0; < Double.MIN_VALUE < ...
1343 * < Double.MAX_VALUE < Double.POSITIVE_INFINITY < Double.NaN
1344 * </p>
1345 * Floating point numbers are encoded as specified in IEEE 754. A 64-bit
1346 * double precision float consists of a sign bit, 11-bit unsigned exponent
1347 * encoded in offset-1023 notation, and a 52-bit significand. The format is
1348 * described further in the <a
1349 * href="http://en.wikipedia.org/wiki/Double_precision"> Double Precision
1350 * Floating Point Wikipedia page</a> </p>
1351 * <p>
1352 * The value of a normal float is -1 <sup>sign bit</sup> ×
1353 * 2<sup>exponent - 1023</sup> × 1.significand
1354 * </p>
1355 * <p>
1356 * The IEE754 floating point format already preserves sort ordering for
1357 * positive floating point numbers when the raw bytes are compared in most
1358 * significant byte order. This is discussed further at <a href=
1359 * "http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm"
1360 * > http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.
1361 * htm</a>
1362 * </p>
1363 * <p>
1364 * Thus, we need only ensure that negative numbers sort in the the exact
1365 * opposite order as positive numbers (so that say, negative infinity is
1366 * less than negative 1), and that all negative numbers compare less than
1367 * any positive number. To accomplish this, we invert the sign bit of all
1368 * floating point numbers, and we also invert the exponent and significand
1369 * bits if the floating point number was negative.
1370 * </p>
1371 * <p>
1372 * More specifically, we first store the floating point bits into a 64-bit
1373 * long {@code l} using {@link Double#doubleToLongBits}. This method
1374 * collapses all NaNs into a single, canonical NaN value but otherwise
1375 * leaves the bits unchanged. We then compute
1376 * </p>
1377 * <pre>
1378 * l ˆ= (l >> (Long.SIZE - 1)) | Long.MIN_SIZE
1379 * </pre>
1380 * <p>
1381 * which inverts the sign bit and XOR's all other bits with the sign bit
1382 * itself. Comparing the raw bytes of {@code l} in most significant
1383 * byte order is equivalent to performing a double precision floating point
1384 * comparison on the underlying bits (ignoring NaN comparisons, as NaNs
1385 * don't compare equal to anything when performing floating point
1386 * comparisons).
1387 * </p>
1388 * <p>
1389 * The resulting long integer is then converted into a byte array by
1390 * serializing the long one byte at a time in most significant byte order.
1391 * The serialized integer is prefixed by a single header byte. All
1392 * serialized values are 9 bytes in length.
1393 * </p>
1394 * <p>
1395 * This encoding format, and much of this highly detailed documentation
1396 * string, is based on Orderly's {@code DoubleWritableRowKey}.
1397 * </p>
1398 * @return the number of bytes written.
1399 * @see #decodeFloat64(PositionedByteRange)
1400 */
1401 public static int encodeFloat64(PositionedByteRange dst, double val, Order ord) {
1402 final int offset = dst.getOffset(), start = dst.getPosition();
1403 long lng = Double.doubleToLongBits(val);
1404 lng ^= ((lng >> Long.SIZE - 1) | Long.MIN_VALUE);
1405 dst.put(FIXED_FLOAT64)
1406 .put((byte) (lng >> 56))
1407 .put((byte) (lng >> 48))
1408 .put((byte) (lng >> 40))
1409 .put((byte) (lng >> 32))
1410 .put((byte) (lng >> 24))
1411 .put((byte) (lng >> 16))
1412 .put((byte) (lng >> 8))
1413 .put((byte) lng);
1414 ord.apply(dst.getBytes(), offset + start, 9);
1415 return 9;
1416 }
1417
1418 /**
1419 * Decode a 64-bit floating point value using the fixed-length encoding.
1420 * @see #encodeFloat64(PositionedByteRange, double, Order)
1421 */
1422 public static double decodeFloat64(PositionedByteRange src) {
1423 final byte header = src.get();
1424 assert header == FIXED_FLOAT64 || header == DESCENDING.apply(FIXED_FLOAT64);
1425 Order ord = header == FIXED_FLOAT64 ? ASCENDING : DESCENDING;
1426 long val = ord.apply(src.get()) & 0xff;
1427 for (int i = 1; i < 8; i++) {
1428 val = (val << 8) + (ord.apply(src.get()) & 0xff);
1429 }
1430 val ^= (~val >> Long.SIZE - 1) | Long.MIN_VALUE;
1431 return Double.longBitsToDouble(val);
1432 }
1433
1434 /**
1435 * Returns true when {@code src} appears to be positioned an encoded value,
1436 * false otherwise.
1437 */
1438 public static boolean isEncodedValue(PositionedByteRange src) {
1439 return isNull(src) || isNumeric(src) || isFixedInt32(src) || isFixedInt64(src)
1440 || isFixedFloat32(src) || isFixedFloat64(src) || isText(src) || isBlobCopy(src)
1441 || isBlobVar(src);
1442 }
1443
1444 /**
1445 * Return true when the next encoded value in {@code src} is null, false
1446 * otherwise.
1447 */
1448 public static boolean isNull(PositionedByteRange src) {
1449 return NULL ==
1450 (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1451 }
1452
1453 /**
1454 * Return true when the next encoded value in {@code src} uses Numeric
1455 * encoding, false otherwise. {@code NaN}, {@code +/-Inf} are valid Numeric
1456 * values.
1457 */
1458 public static boolean isNumeric(PositionedByteRange src) {
1459 byte x = (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1460 return x >= NEG_INF && x <= NAN;
1461 }
1462
1463 /**
1464 * Return true when the next encoded value in {@code src} uses Numeric
1465 * encoding and is {@code Infinite}, false otherwise.
1466 */
1467 public static boolean isNumericInfinite(PositionedByteRange src) {
1468 byte x = (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1469 return NEG_INF == x || POS_INF == x;
1470 }
1471
1472 /**
1473 * Return true when the next encoded value in {@code src} uses Numeric
1474 * encoding and is {@code NaN}, false otherwise.
1475 */
1476 public static boolean isNumericNaN(PositionedByteRange src) {
1477 return NAN == (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1478 }
1479
1480 /**
1481 * Return true when the next encoded value in {@code src} uses Numeric
1482 * encoding and is {@code 0}, false otherwise.
1483 */
1484 public static boolean isNumericZero(PositionedByteRange src) {
1485 return ZERO ==
1486 (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1487 }
1488
1489 /**
1490 * Return true when the next encoded value in {@code src} uses fixed-width
1491 * Int32 encoding, false otherwise.
1492 */
1493 public static boolean isFixedInt32(PositionedByteRange src) {
1494 return FIXED_INT32 ==
1495 (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1496 }
1497
1498 /**
1499 * Return true when the next encoded value in {@code src} uses fixed-width
1500 * Int64 encoding, false otherwise.
1501 */
1502 public static boolean isFixedInt64(PositionedByteRange src) {
1503 return FIXED_INT64 ==
1504 (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1505 }
1506
1507 /**
1508 * Return true when the next encoded value in {@code src} uses fixed-width
1509 * Float32 encoding, false otherwise.
1510 */
1511 public static boolean isFixedFloat32(PositionedByteRange src) {
1512 return FIXED_FLOAT32 ==
1513 (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1514 }
1515
1516 /**
1517 * Return true when the next encoded value in {@code src} uses fixed-width
1518 * Float64 encoding, false otherwise.
1519 */
1520 public static boolean isFixedFloat64(PositionedByteRange src) {
1521 return FIXED_FLOAT64 ==
1522 (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1523 }
1524
1525 /**
1526 * Return true when the next encoded value in {@code src} uses Text encoding,
1527 * false otherwise.
1528 */
1529 public static boolean isText(PositionedByteRange src) {
1530 return TEXT ==
1531 (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1532 }
1533
1534 /**
1535 * Return true when the next encoded value in {@code src} uses BlobVar
1536 * encoding, false otherwise.
1537 */
1538 public static boolean isBlobVar(PositionedByteRange src) {
1539 return BLOB_VAR ==
1540 (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1541 }
1542
1543 /**
1544 * Return true when the next encoded value in {@code src} uses BlobCopy
1545 * encoding, false otherwise.
1546 */
1547 public static boolean isBlobCopy(PositionedByteRange src) {
1548 return BLOB_COPY ==
1549 (-1 == Integer.signum(src.peek()) ? DESCENDING : ASCENDING).apply(src.peek());
1550 }
1551
1552 /**
1553 * Skip {@code buff}'s position forward over one encoded value.
1554 * @return number of bytes skipped.
1555 */
1556 public static int skip(PositionedByteRange src) {
1557 final int start = src.getPosition();
1558 byte header = src.get();
1559 Order ord = (-1 == Integer.signum(header)) ? DESCENDING : ASCENDING;
1560 header = ord.apply(header);
1561
1562 switch (header) {
1563 case NULL:
1564 case NEG_INF:
1565 return 1;
1566 case NEG_LARGE: /* Large negative number: 0x08, ~E, ~M */
1567 skipVaruint64(src, DESCENDING != ord);
1568 skipSignificand(src, DESCENDING != ord);
1569 return src.getPosition() - start;
1570 case NEG_MED_MIN: /* Medium negative number: 0x13-E, ~M */
1571 case NEG_MED_MIN + 0x01:
1572 case NEG_MED_MIN + 0x02:
1573 case NEG_MED_MIN + 0x03:
1574 case NEG_MED_MIN + 0x04:
1575 case NEG_MED_MIN + 0x05:
1576 case NEG_MED_MIN + 0x06:
1577 case NEG_MED_MIN + 0x07:
1578 case NEG_MED_MIN + 0x08:
1579 case NEG_MED_MIN + 0x09:
1580 case NEG_MED_MAX:
1581 skipSignificand(src, DESCENDING != ord);
1582 return src.getPosition() - start;
1583 case NEG_SMALL: /* Small negative number: 0x14, -E, ~M */
1584 skipVaruint64(src, DESCENDING == ord);
1585 skipSignificand(src, DESCENDING != ord);
1586 return src.getPosition() - start;
1587 case ZERO:
1588 return 1;
1589 case POS_SMALL: /* Small positive number: 0x16, ~-E, M */
1590 skipVaruint64(src, DESCENDING != ord);
1591 skipSignificand(src, DESCENDING == ord);
1592 return src.getPosition() - start;
1593 case POS_MED_MIN: /* Medium positive number: 0x17+E, M */
1594 case POS_MED_MIN + 0x01:
1595 case POS_MED_MIN + 0x02:
1596 case POS_MED_MIN + 0x03:
1597 case POS_MED_MIN + 0x04:
1598 case POS_MED_MIN + 0x05:
1599 case POS_MED_MIN + 0x06:
1600 case POS_MED_MIN + 0x07:
1601 case POS_MED_MIN + 0x08:
1602 case POS_MED_MIN + 0x09:
1603 case POS_MED_MAX:
1604 skipSignificand(src, DESCENDING == ord);
1605 return src.getPosition() - start;
1606 case POS_LARGE: /* Large positive number: 0x22, E, M */
1607 skipVaruint64(src, DESCENDING == ord);
1608 skipSignificand(src, DESCENDING == ord);
1609 return src.getPosition() - start;
1610 case POS_INF:
1611 return 1;
1612 case NAN:
1613 return 1;
1614 case FIXED_INT32:
1615 src.setPosition(src.getPosition() + 4);
1616 return src.getPosition() - start;
1617 case FIXED_INT64:
1618 src.setPosition(src.getPosition() + 8);
1619 return src.getPosition() - start;
1620 case FIXED_FLOAT32:
1621 src.setPosition(src.getPosition() + 4);
1622 return src.getPosition() - start;
1623 case FIXED_FLOAT64:
1624 src.setPosition(src.getPosition() + 8);
1625 return src.getPosition() - start;
1626 case TEXT:
1627 // for null-terminated values, skip to the end.
1628 do {
1629 header = ord.apply(src.get());
1630 } while (header != TERM);
1631 return src.getPosition() - start;
1632 case BLOB_VAR:
1633 // read until we find a 0 in the MSB
1634 do {
1635 header = ord.apply(src.get());
1636 } while ((byte) (header & 0x80) != TERM);
1637 return src.getPosition() - start;
1638 case BLOB_COPY:
1639 if (Order.DESCENDING == ord) {
1640 // if descending, read to termination byte.
1641 do {
1642 header = ord.apply(src.get());
1643 } while (header != TERM);
1644 return src.getPosition() - start;
1645 } else {
1646 // otherwise, just skip to the end.
1647 src.setPosition(src.getLength());
1648 return src.getPosition() - start;
1649 }
1650 default:
1651 throw unexpectedHeader(header);
1652 }
1653 }
1654
1655 /**
1656 * Return the number of encoded entries remaining in {@code buff}. The
1657 * state of {@code buff} is not modified through use of this method.
1658 */
1659 public static int length(PositionedByteRange buff) {
1660 PositionedByteRange b =
1661 new SimplePositionedByteRange(buff.getBytes(), buff.getOffset(), buff.getLength());
1662 b.setPosition(buff.getPosition());
1663 int cnt = 0;
1664 for (; isEncodedValue(b); skip(buff), cnt++)
1665 ;
1666 return cnt;
1667 }
1668 }