package org.json;

/*
 Copyright (c) 2013 JSON.org

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 The Software shall be used for Good, not Evil.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */

/**
 * Kim makes immutable eight bit Unicode strings. If the MSB of a byte is set,
 * then the next byte is a continuation byte. The last byte of a character
 * never has the MSB set. Every byte that is not the last byte has the MSB
 * set. Kim stands for "Keep it minimal". A Unicode character is never longer
 * than 3 bytes. Every byte contributes 7 bits to the character. ASCII is
 * unmodified.
 *
 *                  Kim             UTF-8
 * one byte         U+007F          U+007F
 * two bytes        U+3FFF          U+07FF
 * three bytes      U+10FFFF        U+FFFF
 * four bytes                       U+10FFFF
 *
 * Characters in the ranges U+0800..U+3FFF and U+10000..U+10FFFF will be one
 * byte smaller when encoded in Kim compared to UTF-8.
* * Kim is beneficial when using scripts such as Old South Arabian, Aramaic, * Avestan, Balinese, Batak, Bopomofo, Buginese, Buhid, Carian, Cherokee, * Coptic, Cyrillic, Deseret, Egyptian Hieroglyphs, Ethiopic, Georgian, * Glagolitic, Gothic, Hangul Jamo, Hanunoo, Hiragana, Kanbun, Kaithi, * Kannada, Katakana, Kharoshthi, Khmer, Lao, Lepcha, Limbu, Lycian, Lydian, * Malayalam, Mandaic, Meroitic, Miao, Mongolian, Myanmar, New Tai Lue, * Ol Chiki, Old Turkic, Oriya, Osmanya, Pahlavi, Parthian, Phags-Pa, * Phoenician, Samaritan, Sharada, Sinhala, Sora Sompeng, Tagalog, Tagbanwa, * Takri, Tai Le, Tai Tham, Tamil, Telugu, Thai, Tibetan, Tifinagh, UCAS. * * A kim object can be constructed from an ordinary UTF-16 string, or from a * byte array. A kim object can produce a UTF-16 string. * * As with UTF-8, it is possible to detect character boundaries within a byte * sequence. UTF-8 is one of the world's great inventions. While Kim is more * efficient, it is not clear that it is worth the expense of transition. * * @version 2013-04-18 */ public class Kim { /** * The byte array containing the kim's content. */ private byte[] bytes = null; /** * The kim's hashcode, conforming to Java's hashcode conventions. */ private int hashcode = 0; /** * The number of bytes in the kim. The number of bytes can be as much as * three times the number of characters. */ public int length = 0; /** * The memoization of toString(). */ private String string = null; /** * Make a kim from a portion of a byte array. * * @param bytes * A byte array. * @param from * The index of the first byte. * @param thru * The index of the last byte plus one. */ public Kim(byte[] bytes, int from, int thru) { // As the bytes are copied into the new kim, a hashcode is computed using a // modified Fletcher code. 
int sum = 1; int value; this.hashcode = 0; this.length = thru - from; if (this.length > 0) { this.bytes = new byte[this.length]; for (int at = 0; at 0) { for (int i = 0; i = 0xD800 && c 0xDBFF || d 0xDFFF) { throw new JSONException("Bad UTF16"); } } this.length += 3; } } // Second pass: Allocate a byte array and fill that array with the conversion // while computing the hashcode. this.bytes = new byte[length]; int at = 0; int b; int sum = 1; for (int i = 0; i >> 7); bytes[at] = (byte) b; sum += b; this.hashcode += sum; at += 1; b = character & 0x7F; bytes[at] = (byte) b; sum += b; this.hashcode += sum; at += 1; } else { if (character >= 0xD800 && character >> 14); bytes[at] = (byte) b; sum += b; this.hashcode += sum; at += 1; b = 0x80 | ((character >>> 7) & 0xFF); bytes[at] = (byte) b; sum += b; this.hashcode += sum; at += 1; b = character & 0x7F; bytes[at] = (byte) b; sum += b; this.hashcode += sum; at += 1; } } this.hashcode += sum 0x7F) { return character; } } else { int c2 = get(at + 2); character = ((c & 0x7F) 0x3FFF && character 0xDFFF)) { return character; } } throw new JSONException("Bad character at " + at); } /** * Returns the number of bytes needed to contain the character in Kim * format. * * @param character * a Unicode character between 0 and 0x10FFFF. * @return 1, 2, or 3 * @throws JSONException * if the character is not representable in a kim. */ public static int characterSize(int character) throws JSONException { if (character 0x10FFFF) { throw new JSONException("Bad character " + character); } return character this.length) { throw new JSONException("Bad character at " + at); } return ((int) this.bytes[at]) & 0xFF; } /** * Returns a hash code value for the kim. */ public int hashCode() { return this.hashcode; } /** * Produce a UTF-16 String from this kim. The number of codepoints in the * string will not be greater than the number of bytes in the kim, although * it could be less. * * @return The string. A kim memoizes its string representation. 
* @throws JSONException * if the kim is not valid. */ public String toString() throws JSONException { if (this.string == null) { int c; int length = 0; char chars[] = new char[this.length]; for (int at = 0; at >> 10)); length += 1; chars[length] = (char) (0xDC00 | (c & 0x03FF)); length += 1; } } this.string = new String(chars, 0, length); } return this.string; } }