org.apache.lucene.util
public class: IndexableBinaryStringTools [javadoc |
source]
java.lang.Object
org.apache.lucene.util.IndexableBinaryStringTools
Provides support for converting byte sequences to Strings and back again.
The resulting Strings preserve the original byte sequences' sort order.
The Strings are constructed using a Base 8000h encoding of the original
binary data - each char of an encoded String represents a 15-bit chunk
from the byte sequence. Base 8000h was chosen because it allows for all
lower 15 bits of char to be used without restriction; the surrogate range
[U+D800-U+DFFF] does not represent valid chars, and would require
complicated handling to avoid them and allow use of char's high bit.
Although unset bits are used as padding in the final char, the original
byte sequence could contain trailing bytes with no set bits (null bytes):
padding is indistinguishable from valid information. To overcome this
problem, a char is appended, indicating the number of encoded bytes in the
final content char.
This class's operations are defined over CharBuffers and ByteBuffers, to
allow for wrapped arrays to be reused, reducing memory allocation costs for
repeated operations. Note that this class calls array() and arrayOffset()
on the CharBuffers and ByteBuffers it uses, so only wrapped arrays may be
used. This class interprets the arrayOffset() and limit() values returned by
its input buffers as beginning and end+1 positions on the wrapped array,
respectively; similarly, on the output buffer, arrayOffset() is the first
position written to, and limit() is set to one past the final output array
position.
Methods from java.lang.Object: |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Method from org.apache.lucene.util.IndexableBinaryStringTools Detail: |
/**
 * Decodes the given char sequence into a freshly allocated byte sequence.
 * The returned buffer wraps a new array whose size is obtained from
 * getDecodedLength(input).
 *
 * @param input the encoded char sequence
 * @return a ByteBuffer wrapping the decoded bytes
 */
public static ByteBuffer decode(CharBuffer input) {
  final int decodedSize = getDecodedLength(input);
  final ByteBuffer result = ByteBuffer.wrap(new byte[decodedSize]);
  decode(input, result);
  return result;
}
|
public static void decode(CharBuffer input,
ByteBuffer output) {
if (input.hasArray() && output.hasArray()) {
int numInputChars = input.limit() - input.arrayOffset() - 1;
int numOutputBytes = getDecodedLength(input);
output.limit(numOutputBytes + output.arrayOffset()); // Set output final pos + 1
output.position(0);
byte[] outputArray = output.array();
char[] inputArray = input.array();
if (numOutputBytes > 0) {
int caseNum = 0;
int outputByteNum = output.arrayOffset();
int inputCharNum = input.arrayOffset();
short inputChar;
CodingCase codingCase;
for ( ; inputCharNum < numInputChars - 1 ; ++inputCharNum) {
codingCase = CODING_CASES[caseNum];
inputChar = (short)inputArray[inputCharNum];
if (2 == codingCase.numBytes) {
if (0 == caseNum) {
outputArray[outputByteNum] = (byte)(inputChar > > > codingCase.initialShift);
} else {
outputArray[outputByteNum] += (byte)(inputChar > > > codingCase.initialShift);
}
outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask)
< < codingCase.finalShift);
} else { // numBytes is 3
outputArray[outputByteNum] += (byte)(inputChar > > > codingCase.initialShift);
outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask)
> > > codingCase.middleShift);
outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask)
< < codingCase.finalShift);
}
outputByteNum += codingCase.advanceBytes;
if (++caseNum == CODING_CASES.length) {
caseNum = 0;
}
}
// Handle final char
inputChar = (short)inputArray[inputCharNum];
codingCase = CODING_CASES[caseNum];
if (0 == caseNum) {
outputArray[outputByteNum] = 0;
}
outputArray[outputByteNum] += (byte)(inputChar > > > codingCase.initialShift);
int bytesLeft = numOutputBytes - outputByteNum;
if (bytesLeft > 1) {
if (2 == codingCase.numBytes) {
outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask)
> > > codingCase.finalShift);
} else { // numBytes is 3
outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask)
> > > codingCase.middleShift);
if (bytesLeft > 2) {
outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask)
< < codingCase.finalShift);
}
}
}
}
} else {
throw new IllegalArgumentException("Arguments must have backing arrays");
}
}
Decodes the input char sequence into the output byte sequence. Before
calling this method, ensure that the output ByteBuffer has sufficient
capacity; use #getDecodedLength(java.nio.CharBuffer) to find the required size. |
/**
 * Encodes the given byte sequence into a freshly allocated char sequence.
 * The returned buffer wraps a new array whose size is obtained from
 * getEncodedLength(input).
 *
 * @param input the byte sequence to encode
 * @return a CharBuffer wrapping the encoded chars
 */
public static CharBuffer encode(ByteBuffer input) {
  final int encodedSize = getEncodedLength(input);
  final CharBuffer result = CharBuffer.wrap(new char[encodedSize]);
  encode(input, result);
  return result;
}
Encodes the input byte sequence. |
public static void encode(ByteBuffer input,
CharBuffer output) {
if (input.hasArray() && output.hasArray()) {
byte[] inputArray = input.array();
int inputOffset = input.arrayOffset();
int inputLength = input.limit() - inputOffset;
char[] outputArray = output.array();
int outputOffset = output.arrayOffset();
int outputLength = getEncodedLength(input);
output.limit(outputOffset + outputLength); // Set output final pos + 1
output.position(0);
if (inputLength > 0) {
int inputByteNum = inputOffset;
int caseNum = 0;
int outputCharNum = outputOffset;
CodingCase codingCase;
for ( ; inputByteNum + CODING_CASES[caseNum].numBytes < = inputLength ;
++outputCharNum ) {
codingCase = CODING_CASES[caseNum];
if (2 == codingCase.numBytes) {
outputArray[outputCharNum]
= (char)(((inputArray[inputByteNum] & 0xFF) < < codingCase.initialShift)
+ (((inputArray[inputByteNum + 1] & 0xFF) > > > codingCase.finalShift)
& codingCase.finalMask)
& (short)0x7FFF);
} else { // numBytes is 3
outputArray[outputCharNum]
= (char)(((inputArray[inputByteNum] & 0xFF) < < codingCase.initialShift)
+ ((inputArray[inputByteNum + 1] & 0xFF) < < codingCase.middleShift)
+ (((inputArray[inputByteNum + 2] & 0xFF) > > > codingCase.finalShift)
& codingCase.finalMask)
& (short)0x7FFF);
}
inputByteNum += codingCase.advanceBytes;
if (++caseNum == CODING_CASES.length) {
caseNum = 0;
}
}
// Produce final char (if any) and trailing count chars.
codingCase = CODING_CASES[caseNum];
if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3
outputArray[outputCharNum++]
= (char)((((inputArray[inputByteNum] & 0xFF) < < codingCase.initialShift)
+ ((inputArray[inputByteNum + 1] & 0xFF) < < codingCase.middleShift))
& (short)0x7FFF);
// Add trailing char containing the number of full bytes in final char
outputArray[outputCharNum++] = (char)1;
} else if (inputByteNum < inputLength) {
outputArray[outputCharNum++]
= (char)(((inputArray[inputByteNum] & 0xFF) < < codingCase.initialShift)
& (short)0x7FFF);
// Add trailing char containing the number of full bytes in final char
outputArray[outputCharNum++] = caseNum == 0 ? (char)1 : (char)0;
} else { // No left over bits - last char is completely filled.
// Add trailing char containing the number of full bytes in final char
outputArray[outputCharNum++] = (char)1;
}
}
} else {
throw new IllegalArgumentException("Arguments must have backing arrays");
}
}
Encodes the input byte sequence into the output char sequence. Before
calling this method, ensure that the output CharBuffer has sufficient
capacity; use #getEncodedLength(java.nio.ByteBuffer) to find the required size. |
public static int getDecodedLength(CharBuffer encoded) throws IllegalArgumentException {
if (encoded.hasArray()) {
int numChars = encoded.limit() - encoded.arrayOffset() - 1;
if (numChars < = 0) {
return 0;
} else {
int numFullBytesInFinalChar = encoded.charAt(encoded.limit() - 1);
int numEncodedChars = numChars - 1;
return (numEncodedChars * 15 + 7) / 8 + numFullBytesInFinalChar;
}
} else {
throw new IllegalArgumentException("encoded argument must have a backing array");
}
}
Returns the number of bytes required to decode the given char sequence. |
/**
 * Returns the number of chars required to encode the given byte sequence:
 * one char per started group of 15 input bits, plus one extra char.
 *
 * @param original the byte sequence to be encoded; must have a backing array
 * @return the encoded length in chars
 * @throws IllegalArgumentException if original has no accessible backing array
 */
public static int getEncodedLength(ByteBuffer original) throws IllegalArgumentException {
  if (!original.hasArray()) {
    throw new IllegalArgumentException("original argument must have a backing array");
  }
  // Widen to long so the bit-count arithmetic below cannot overflow.
  final long byteCount = original.limit() - original.arrayOffset();
  final long contentChars = (byteCount * 8L + 14L) / 15L;
  return (int) contentChars + 1;
}
Returns the number of chars required to encode the given byte sequence. |