/* Validity checks for UTF8 byte codes
 *
 *  UTF8.h
 *
 *  Created by Sunrise Telephone Systems Ltd.
 *
 *  This file ("UTF8.h") is hereby placed into the public domain.
 *
 */

#ifndef UTF8_H
#define UTF8_H

// ---------------------------------------------------------------------------
// General notes on all macros in this file
// ---------------------------------------------------------------------------
//
// * Arguments are expected to be byte values in the range 0 .. FF held in an
//   unsigned type (e.g. unsigned char).  A negative value, such as a byte
//   >= 80 read through a signed char, fails every range comparison below.
// * Arguments may be evaluated more than once; do not pass expressions with
//   side effects (e.g. *p++).
// * Every argument is parenthesized in the expansion, so arbitrary
//   expressions may be passed safely.


// ---------------------------------------------------------------------------
// Macro IS_LEGAL_UTF8_LEAD_BYTE(x)
// ---------------------------------------------------------------------------
//
// RFC3629 defines UTF8 lead bytes to be in the range of 0 .. 7F and C2 .. F4.
// This macro returns true if byte x is a legal UTF8 lead byte, otherwise
// false.
//
// NOTE(review): 00 is rejected by this macro although RFC3629 permits it,
// presumably because NUL acts as the string terminator -- confirm against
// callers.  That behaviour is kept unchanged here.
//
#define IS_LEGAL_UTF8_LEAD_BYTE(x) \
    ((((x) > 0) && ((x) < 0x80)) || (((x) >= 0xC2) && ((x) <= 0xF4)))


// ---------------------------------------------------------------------------
// Macro IS_ILLEGAL_UTF8_LEAD_BYTE(x)
// ---------------------------------------------------------------------------
//
// RFC3629 excludes the use of legacy UTF8 lead bytes in the range of C0 .. C1
// and F5 .. FF.  This macro returns true if byte x is an illegal UTF8 lead
// byte, otherwise false.
//
#define IS_ILLEGAL_UTF8_LEAD_BYTE(x) \
    (((x) == 0xC0) || ((x) == 0xC1) || (((x) >= 0xF5) && ((x) <= 0xFF)))


// ---------------------------------------------------------------------------
// Macro IS_LEGAL_UTF8_2ND_BYTE(lead byte, 2nd byte)
// ---------------------------------------------------------------------------
//
// RFC3629 defines the ranges for the second byte in UTF8 multi-byte sequence
// as follows:
//
//    lead byte     2nd byte
//    C2 .. DF      80 .. BF
//    E0            A0 .. BF
//    E1 .. EC      80 .. BF
//    ED            80 .. 9F
//    EE .. EF      80 .. BF
//    F0            90 .. BF
//    F1 .. F3      80 .. BF
//    F4            80 .. 8F
//
// This macro returns true if 'follow' is a legal 2nd byte following lead
// byte 'lead' in a UTF8 multi-byte sequence, otherwise false.
//
#define IS_LEGAL_UTF8_2ND_BYTE(lead, follow) \
    ((((((lead) >= 0xC2) && ((lead) <= 0xDF)) || \
       (((lead) >= 0xE1) && ((lead) <= 0xEC)) || \
       (((lead) >= 0xEE) && ((lead) <= 0xEF)) || \
       (((lead) >= 0xF1) && ((lead) <= 0xF3))) && \
      ((follow) >= 0x80) && ((follow) <= 0xBF)) || \
     (((lead) == 0xE0) && ((follow) >= 0xA0) && ((follow) <= 0xBF)) || \
     (((lead) == 0xED) && ((follow) >= 0x80) && ((follow) <= 0x9F)) || \
     (((lead) == 0xF0) && ((follow) >= 0x90) && ((follow) <= 0xBF)) || \
     (((lead) == 0xF4) && ((follow) >= 0x80) && ((follow) <= 0x8F)))


// ---------------------------------------------------------------------------
// Macro IS_LEGAL_UTF8_TAIL_BYTE(x)
// ---------------------------------------------------------------------------
//
// RFC3629 mandates that any bytes following a lead byte must be in the range
// of 80 .. BF.  This macro returns true if byte x is a legal UTF8 follow-on
// byte, otherwise false.
//
#define IS_LEGAL_UTF8_TAIL_BYTE(x) \
    (((x) >= 0x80) && ((x) <= 0xBF))


// ---------------------------------------------------------------------------
// Macro IS_UTF8_LEN1(x)
// ---------------------------------------------------------------------------
//
// Returns true if byte x is a single byte UTF8 character, otherwise false.
// The test only inspects bit 7, so 00 counts as a single byte character.
//
#define IS_UTF8_LEN1(x) \
    (((x) & 0x80) == 0)


// ---------------------------------------------------------------------------
// Macro IS_UTF8_LEN2(x)
// ---------------------------------------------------------------------------
//
// Returns true if byte x is a lead byte of a two byte UTF8 sequence,
// otherwise false.
//
#define IS_UTF8_LEN2(x) \
    (((x) >= 0xC2) && ((x) <= 0xDF))


// ---------------------------------------------------------------------------
// Macro IS_UTF8_LEN3(x)
// ---------------------------------------------------------------------------
//
// Returns true if byte x is a lead byte of a three byte UTF8 sequence,
// otherwise false.
//
#define IS_UTF8_LEN3(x) \
    (((x) >= 0xE0) && ((x) <= 0xEF))


// ---------------------------------------------------------------------------
// Macro IS_UTF8_LEN4(x)
// ---------------------------------------------------------------------------
//
// Returns true if byte x is a lead byte of a four byte UTF8 sequence,
// otherwise false.  The legal range is F0 .. F4; F5 and above are illegal
// lead bytes under RFC3629.
//
#define IS_UTF8_LEN4(x) \
    (((x) >= 0xF0) && ((x) <= 0xF4))


// ---------------------------------------------------------------------------
// Macro UTF8_LENGTH(x)
// ---------------------------------------------------------------------------
//
// RFC3629 defines the length of legal UTF8 byte sequences as follows:
//
//    lead byte     length of sequence
//    00 .. 7F      1 byte
//    C2 .. DF      2 bytes
//    E0 .. EF      3 bytes
//    F0 .. F4      4 bytes
//
// This macro returns the length of a UTF8 sequence for lead byte x.  It
// returns 0 if x is not a legal UTF8 lead byte.
//
#define UTF8_LENGTH(x) \
    (IS_UTF8_LEN1(x) ? 1 : \
     IS_UTF8_LEN2(x) ? 2 : \
     IS_UTF8_LEN3(x) ? 3 : \
     IS_UTF8_LEN4(x) ? 4 : 0)

#endif /* UTF8_H */

// END OF FILE