/* Asterisk INI to Property List File Conversion Utility -- Version 1.10 * * scanner.c * aini2plist * * Lexical analyser for Asterisk INI configuration files. * * Author: Benjamin Kowarsch * * (C) 2005, 2006 Sunrise Telephone Systems Ltd. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * In countries and territories where the above no-warranty disclaimer is * not permissible by applicable law, the following terms apply: * * NO PERMISSION TO USE THE SOFTWARE IS GRANTED AND THE SOFTWARE MUST NOT BE * USED AT ALL IN SUCH COUNTRIES AND TERRITORIES WHERE THE ABOVE NO-WARRANTY * DISCLAIMER IS NOT PERMISSIBLE AND INVALIDATED BY APPLICABLE LAW. HOWEVER, * THE COPYRIGHT HOLDERS HEREBY WAIVE THEIR RIGHT TO PURSUE OFFENDERS AS LONG * AS THEY OTHERWISE ABIDE BY THE TERMS OF THE LICENSE AS APPLICABLE FOR USE * OF THE SOFTWARE IN COUNTRIES AND TERRITORIES WHERE THE ABOVE NO-WARRANTY * DISCLAIMER IS PERMITTED BY APPLICABLE LAW. THIS WAIVER DOES NOT CONSTITUTE * A LICENSE TO USE THE SOFTWARE IN COUNTRIES AND TERRITORIES WHERE THE ABOVE * NO-WARRANTY DISCLAIMER IS NOT PERMISSIBLE AND INVALIDATED BY APPLICABLE * LAW. ANY LIABILITY OF ANY KIND IS CATEGORICALLY RULED OUT AT ALL TIMES. */ #include #include #include #include #include #include "UTF8.h" #include "ASCII.h" #include "pathnames.h" #include "globaldefs.h" #include "scanner.h" // --------------------------------------------------------------------------- // CHAR type definition // --------------------------------------------------------------------------- // // Characters are *unsigned*, let's not be folly and treat them any other way. typedef unsigned char CHAR; // --------------------------------------------------------------------------- // Lexical parameters // --------------------------------------------------------------------------- // // maximum length allowed for indentifiers #define MAX_IDENT_LEN MAXIMUM_LENGTH_FOR_IDENTIFIERS // maximum length allowed for comments #define MAX_COMMENT_LEN MAXIMUM_LENGTH_FOR_VALUES // maximum length allowed for directives #define MAX_DIRECTIVE_LEN MAXIMUM_LENGTH_FOR_IDENTIFIERS // maximum length allowed for plain values #define MAX_PLAIN_VALUE_LEN MAXIMUM_LENGTH_FOR_VALUES // maximum length allowed for quoted values #define MAX_QUOTED_VALUE_LEN MAXIMUM_LENGTH_FOR_STRINGS // maximum length allowed for bracketed values #define MAX_BRACKETED_VALUE_LEN 24 // maximum length allowed for argument lists #define MAX_ARGUMENT_LIST_LEN MAXIMUM_LENGTH_FOR_VALUES // --------------------------------------------------------------------------- // Directives // --------------------------------------------------------------------------- // // hash value for keyword "include" #define INCLUDE_FILE 0x7DF798E8 // hash value for keyword "allow_utf8" #define ALLOW_UTF8 0x694B2587 // hash value for keyword "yes" #define YES_VALUE 0x3BFA6D47 // hash value for keyword "no" #define NO_VALUE 0x006E1B81 // hash value for keyword "true" #define TRUE_VALUE 0x4DAE9B2E // hash value for keyword "false" #define FALSE_VALUE 0x66E499E3 // hash value for keyword "callerid" #define CALLERID_KEY 0x4C4993C6 // hash value for keyword "inkeys" #define INKEYS_KEY 0x6C533139 // hash value for keyword "regexten" #define REGEXTEN_KEY 0x5DA3D4F6 // --------------------------------------------------------------------------- // Lexeme buffer size // --------------------------------------------------------------------------- // // determine the buffer size required for the largest lexeme #define LEX_BUFFER_CAPACITY \ MAX6(MAX_IDENT_LEN, MAX_COMMENT_LEN, MAX_PLAIN_VALUE_LEN, \ MAX_QUOTED_VALUE_LEN, MAX_BRACKETED_VALUE_LEN, MAX_ARGUMENT_LIST_LEN) + 1 // --------------------------------------------------------------------------- // File include parameter stack // --------------------------------------------------------------------------- #define MAX_INCLUDE_LEVEL 8 typedef /* IncludeFileEntry */ struct { CHAR *path; FILE *file; ScannerPosition position; } IncludeStackObj; // --------------------------------------------------------------------------- // Scanner state type definition // --------------------------------------------------------------------------- typedef /* Scanner */ struct { bool initialized; // initialisation status bool end_of_file; // end-of-file flag bool lefthandSide; // left of assignment operator bool allowUTF8; // UTF8 allowed in quoted values bool ampValSep; // treat & as value separator bool colonValSep; // treat : as value separator bool allowBracketValue; // allow bracketed values CHAR *filename; // name of source file FILE *sourcefile; // handle for source file ScannerPosition currentPos; // current position CHAR *lexbuf; // pointer to lexeme buffer ScannerToken lastSym; // last symbol found CARDINAL lastIdentHash; // hash value of last ident symbol ScannerPosition lastSymPos; // position of last symbol found ScannerStatus lastStatus; // status of last operation bool illegal_chars_skipped; // illegal characters were skipped bool excess_chars_ignored; // excess characters were ignored bool auto_terminated; // symbol was terminated by EOL or EOF CARDINAL includeLevel; // file include level and stack IncludeStackObj includeStack[MAX_INCLUDE_LEVEL]; } Scanner; // --------------------------------------------------------------------------- // Scanner state variable // --------------------------------------------------------------------------- static Scanner scanner = { /* initialized */ false, /* end_of_file */ false, /* lefthandSide */ true, /* allowUTF8 */ true, /* ampValSep */ false, /* colonValSep */ false, /* allowBracketValue */ false, /* filename */ NULL, /* sourcefile */ NULL, /* currentPos */ { 0, 0 }, /* lexbuf */ NULL, /* lastSym */ NO_TOKEN, /* lastIdentHash */ 0, /* LastSymPos */ { 0, 0 }, /* lastStatus */ SCANNER_STATUS_UNDEFINED, /* illegal_chars_skipped */ false, /* excess_chars_ignored */ false, /* auto_terminated */ false, /* includeLevel */ 0 }; // --------------------------------------------------------------------------- // Human readable names of tokens // --------------------------------------------------------------------------- // Microsoft compiler people are either uneducated or lazy or arrogant or any // combination thereof. While ANSI C allows struct initialisers such as this: // postion = (ScannerPosition) { row, line }, MSFT won't have any of it and // since we want to be cross-platform we have to go through hoops here ... #define SET_POSITION(pos,_row,_line) \ pos.row = _row; \ pos.line = _line; // --------------------------------------------------------------------------- // Human readable names of tokens // --------------------------------------------------------------------------- typedef char token_name_str[MAX_IDENT_LEN]; static token_name_str ScannerTokenName[] = { "", "comment", "start of section header", "start of embedded file header", "end of header", "identifier", "assign operator", "plain value", "encoding prefix", "quoted value", "value in angular brackets", "value separator", "argument list in parentheses", "end-of-line marker", "end-of-file marker" }; // =========================================================================== // P R I V A T E F U N C T I O N S // =========================================================================== // --------------------------------------------------------------------------- // private function: process_include_directive() // --------------------------------------------------------------------------- // static FILE *open_include_file(const char *path) { FILE *file = NULL; if ((scanner.includeLevel >= MAX_INCLUDE_LEVEL) || (fileDoesNotExistAtPath(path))) { return NULL; } // end if // TO DO : check for loops file = fopen(path, "r"); if (file != NULL) { // save current sourcefile parameters scanner.includeStack[scanner.includeLevel].path = scanner.filename; scanner.includeStack[scanner.includeLevel].file = scanner.sourcefile; scanner.includeStack[scanner.includeLevel].position = scanner.currentPos; // initialise parameters for include file scanner.filename = (CHAR *) path; scanner.sourcefile = file; SET_POSITION(scanner.currentPos, 0, 1); // scanner.currentPos = (ScannerPosition) { 0, 1 }; scanner.includeLevel++; } // end if return file; } // end process_include_directive // --------------------------------------------------------------------------- // private function: close_include_file() // --------------------------------------------------------------------------- // static int close_include_file() { if (scanner.includeLevel > 0) { scanner.includeLevel--; scanner.filename = scanner.includeStack[scanner.includeLevel].path; free(scanner.includeStack[scanner.includeLevel + 1].path); scanner.includeStack[scanner.includeLevel + 1].path = NULL; scanner.sourcefile = scanner.includeStack[scanner.includeLevel].file; fclose(scanner.includeStack[scanner.includeLevel + 1].file); scanner.includeStack[scanner.includeLevel + 1].file = NULL; scanner.currentPos = scanner.includeStack[scanner.includeLevel].position; SET_POSITION(scanner.includeStack[scanner.includeLevel + 1].position, 0, 0) // scanner.includeStack[scanner.includeLevel + 1].position = (ScannerPosition) { 0, 0 }; return 1; } else { return 0; } // end if } // end close_include_file // --------------------------------------------------------------------------- // forward declaration of function: process_directive() // --------------------------------------------------------------------------- static int process_directive(); /* FORWARD */ // --------------------------------------------------------------------------- // private function: readchar() // --------------------------------------------------------------------------- // // Reads one character from sourcefile and returns it. Global variable coloumn // is incremented. Returns the linefeed character (LF) if LF or carriage // return (CR) or CRLF is read. If LF is returned, global variable coloumn // will be reset to 0 and lineCounter will be incremented. static CHAR readchar() { register int c; // read one character from source file c = getc(scanner.sourcefile); // transparently handle directives if ((scanner.currentPos.row == 1) && (c == NUMBER_SIGN)) { scanner.currentPos.row++; process_directive(); c = getc(scanner.sourcefile); } // end if // handle LF style end-of-line if (c == ASCII_LF) { scanner.currentPos.row = 1; scanner.currentPos.line++; scanner.lefthandSide = true; } // handle CRLF and CR style end-of-line else if (c == ASCII_CR) { scanner.currentPos.row = 1; scanner.currentPos.line++; scanner.lefthandSide = true; c = getc(scanner.sourcefile); if (c != NEWLINE) { ungetc(c, scanner.sourcefile); } // end if c = NEWLINE; } // handle end-of-file else if (c == EOF) { // set end-of-file flag if end-of-file reached if (scanner.includeLevel > 0) { close_include_file(); return readchar(); } else { scanner.end_of_file = (feof(scanner.sourcefile) == true); c = 0; } // end if } // for any other characters, simply increment row counter else { scanner.currentPos.row++; } // end if if (((CHAR)c == 255) || (c == 0)) { printf(""); } // end if // return character return (CHAR)c; } // end readchar // --------------------------------------------------------------------------- // private function: nextchar() // --------------------------------------------------------------------------- // // Returns the next character in sourcefile without incrementing the file // pointer and without changing global variables coloumn and lineCounter. static CHAR nextchar() { register int status; register int c; c = getc(scanner.sourcefile); status = ungetc(c, scanner.sourcefile); if (status != EOF) { scanner.end_of_file = false; } else { scanner.end_of_file = true; c = 0; } // end if return (CHAR)c; } // end nextchar // --------------------------------------------------------------------------- // private macros: EOF_REACHED and NOT_EOF // --------------------------------------------------------------------------- // #define EOF_REACHED (scanner.end_of_file == true) #define NOT_EOF (scanner.end_of_file == false) // --------------------------------------------------------------------------- // private function: skip_remainder_of_line() // --------------------------------------------------------------------------- // static CHAR skip_remainder_of_line() { register CHAR ch; // take a peek at the next character ch = nextchar(); // skip all characters until end-of-line of end-of-file is reached while ((ch != EOL) && (NOT_EOF)) { // skip the current character ch = readchar(); // take a peek at the next one ch = nextchar(); } // end while; return ch; } // end skip_remainder_of_line // --------------------------------------------------------------------------- // private function: skip_whitespace_and_tabs() // --------------------------------------------------------------------------- // static CHAR skip_whitespace_and_tabs() { register CHAR ch; // take a peek at the next character ch = nextchar(); // skip all whitespace and tab characters while ((ch == WHITESPACE) || (ch == TAB)) { // skip the current character ch = readchar(); // take a peek at the next one ch = nextchar(); } // end while; return ch; } // end skip_whitespace_and_tabs // --------------------------------------------------------------------------- // private function: skip_whitespace_and_control_chars() // --------------------------------------------------------------------------- // static CHAR skip_whitespace_and_control_chars() { register CHAR ch; // take a peek at the next character ch = nextchar(); // skip all whitespace and control characters but not end-of-line while ((ch == WHITESPACE) || ((ch != EOL) && (NOT_EOF) && (IS_CONTROL(ch)))) { // skip the current character ch = readchar(); // take a peek at the next one ch = nextchar(); } // end while; return ch; } // end skip_whitespace_and_control_chars // --------------------------------------------------------------------------- // private function: get_path() // --------------------------------------------------------------------------- // static char *get_path() { register bool quoted = false; register CARDINAL index = 0; register CHAR ch; char *path = NULL; path = malloc(MAX_PATHNAME_LENGTH + 1); if (path == NULL) { return NULL; } // end if // skip leading whitespace and tabs ch = skip_whitespace_and_control_chars(); // skip leading quote if (ch == DOUBLE_QUOTE) { ch = readchar(); ch = nextchar(); quoted = true; } // end if // read the pathname while ((IS_ALPHANUM(ch)) || (ch == UNDERSCORE) || (ch == HYPHEN) || (ch == DOT) || (ch == FILESYSTEM_DIRECTORY_SEPARATOR) || ((quoted == true) && (ch == WHITESPACE))) { ch = readchar(); if (index < MAX_PATHNAME_LENGTH) { path[index] = ch; index++; } // end if ch = nextchar(); } // end while // terminate the string path[index] = CSTRING_TERMINATOR; return path; } // end get_path // --------------------------------------------------------------------------- // private function: get_bool() // --------------------------------------------------------------------------- // static CARDINAL get_bool() { register CHAR ch; register CARDINAL hash = 0; register CARDINAL index = 0; // skip leading whitespace and tabs ch = skip_whitespace_and_control_chars(); if (ch == DIGIT_ONE) { ch = readchar(); ch = nextchar(); if ((ch == WHITESPACE) || (IS_CONTROL(ch)) || (EOF_REACHED)) { ch = readchar(); return TRUE_VALUE; } // end if return 0; } else if (ch == DIGIT_ZERO) { ch = readchar(); ch = nextchar(); if ((ch == WHITESPACE) || (IS_CONTROL(ch)) || (EOF_REACHED)) { ch = readchar(); return FALSE_VALUE; } // end if return 0; } // end if while (IS_LOWERCASE(ch)) { ch = readchar(); hash = ch + (hash << 6) + (hash << 16) - hash; ch = nextchar(); index++; } // end while hash = (hash & 0x7FFFFFFF); return hash; } // end if // --------------------------------------------------------------------------- // private function: process_directive() // --------------------------------------------------------------------------- // static int process_directive() { register CARDINAL index = 0; register CARDINAL hash = 0; register CHAR ch; char *path = NULL; // take a peek at the next character ch = nextchar(); // if the first character is a digit or underscore ... if ((IS_DIGIT(ch)) || (ch == UNDERSCORE)) { // ... it cannot be a valid directive // skip the remainder of the line skip_remainder_of_line(); // return with failure code return 0; } // end if // read the directive's identifier and calculate its hash value while ((IS_ALPHANUM(ch)) || (ch == UNDERSCORE)) { ch = readchar(); hash = ch + (hash << 6) + (hash << 16) - hash; ch = nextchar(); index++; } // end while hash = (hash & 0x7FFFFFFF); // if the identifier is too long ... if (index > MAX_DIRECTIVE_LEN ) { // ... it cannot be a valid directive // skip the remainder of the line skip_remainder_of_line(); // return with failure code return 0; } // end if // match and process the directive if (hash == INCLUDE_FILE) { // found include file directive // skip if maximum include level reached if (scanner.includeLevel >= MAX_INCLUDE_LEVEL) { skip_remainder_of_line(); // return with failure code return 0; } // end if // get the path of the include file ch = skip_whitespace_and_tabs(); if ((ch != EOL) && (NOT_EOF)) { path = get_path(); skip_remainder_of_line(); if (open_include_file(path) == NULL) { free(path); return 0; } // end if } else { // return with failure code return 0; } // end if } else if (hash == ALLOW_UTF8) { // found utf8 control directive ch = skip_whitespace_and_tabs(); if (IS_BOOLEAN(ch)) { hash = get_bool(); if ((hash == YES_VALUE) || (hash == TRUE_VALUE)) { scanner.allowUTF8 = true; } else if ((hash == NO_VALUE) || (hash == FALSE_VALUE)) { scanner.allowUTF8 = false; } else { skip_remainder_of_line(); // return with failure code return 0; } // end if } // end if } else { // no matches, it is not a valid directive // skip the remainder of the line skip_remainder_of_line(); // return with failure code return 0; } // end if return 1; } // end process_directive // --------------------------------------------------------------------------- // private function: copy_to_lexbuf(*str) // --------------------------------------------------------------------------- // static int copy_to_lexbuf(const char *str) { register CARDINAL index = 0; while ((index < LEX_BUFFER_CAPACITY) && (str[index] != CSTRING_TERMINATOR)) { scanner.lexbuf[index] = (CHAR) str[index]; index++; } // end while scanner.lexbuf[index] = CSTRING_TERMINATOR; return 1; } // end copy_to_lexbuf // --------------------------------------------------------------------------- // private function: append_to_lexbuf(*str) // --------------------------------------------------------------------------- // static int append_to_lexbuf(const char *str) { register CARDINAL target_index = 0; register CARDINAL source_index = 0; while ((target_index < LEX_BUFFER_CAPACITY) && (scanner.lexbuf[target_index] != CSTRING_TERMINATOR)) { target_index++; } // end while while ((target_index < LEX_BUFFER_CAPACITY) && (str[source_index] != CSTRING_TERMINATOR)) { scanner.lexbuf[target_index] = (CHAR) str[source_index]; target_index++; source_index++; } // end while scanner.lexbuf[target_index] = CSTRING_TERMINATOR; return 1; } // end append_to_lexbuf // --------------------------------------------------------------------------- // private function: get_comment() // --------------------------------------------------------------------------- // // pre-conditions: // // the current character must be the semicolon that starts this comment // // post-conditions: // // first MAX_COMMENT_LEN characters (excluding any illegal characters) // have been copied to the lexeme buffer and the buffer content has been // terminated with a C string terminator. // // the current character is the last character before EOL or EOF // static int get_comment() { register CARDINAL index = 0; register CHAR ch = readchar(); // remember the position scanner.lastSymPos = scanner.currentPos; // take a peek at the next character ch = nextchar(); // skip any leading control characters including whitespace and tab ch = skip_whitespace_and_control_chars(); // read all characters until end of line or end of file is reached // or the maximum comment length is reached or the lexeme buffer is full while ((ch != EOL) && (NOT_EOF) && (index < MAX_COMMENT_LEN)) { // read the current character ch = readchar(); // copy it to lexeme buffer unless it is a control character other than tab if ((IS_NOT_CONTROL(ch)) || (ch == TAB)) { scanner.lexbuf[index] = ch; index++; } // end if // take a peek at the next character ch = nextchar(); } // end while // skip any characters until end of line or end of file is reached if (index == MAX_COMMENT_LEN) { skip_remainder_of_line(); } // end if // remove any trailing whitespace and tabs while ((index > 0) && (IS_WHITESPACE_OR_TAB(scanner.lexbuf[index-1]))) { index--; } // end while // if we are not at the end of the lexeme buffer if (index < LEX_BUFFER_CAPACITY) { // terminate the string in the buffer scanner.lexbuf[index] = CSTRING_TERMINATOR; } // end if return 1; } // end get_comment // --------------------------------------------------------------------------- // private function: get_ident() // --------------------------------------------------------------------------- // // pre-conditions: // // the current character must be the character immediately before this symbol // // post-conditions: // static int get_ident() { register CARDINAL index = 0; register CARDINAL hash = 0; register CHAR ch; ch = readchar(); // copy to lexeme buffer scanner.lexbuf[index] = ch; hash = ch + (hash << 6) + (hash << 16) - hash; index++; // take a peek at the next character ch = nextchar(); // read the remainder of the identifier while (((IS_ALPHANUM(ch)) || (ch == UNDERSCORE) || (ch == HYPHEN) || (ch == DOT)) && (index < MAX_IDENT_LEN)) { // read the current character ch = readchar(); // copy it to lexeme buffer scanner.lexbuf[index] = ch; hash = ch + (hash << 6) + (hash << 16) - hash; index++; // take a peek at the next character ch = nextchar(); } // end while // terminate the string in the lexeme buffer scanner.lexbuf[index] = CSTRING_TERMINATOR; // remember the hash value scanner.lastIdentHash = (hash & 0x7FFFFFFF); // Who was the moron to decide to use colon and ampersand as // exceptional value separators just for inkeys and regexten? // Pal, if it was you and you are reading this, please get // an education before you do any more harm. Get the Dragon book ... // "Compilers" by Aho/Sethi/Ullman, ISBN: 0201100886, Addison Wesley // The callerid syntax isn't all too smart either. // Ever heard of context free grammars? Guess not. if (scanner.lastIdentHash == CALLERID_KEY) { scanner.allowBracketValue = true; // needs reset at EOL } // The value separators used anywhere else are comma and vertical bar, // not so for parameter "inkeys" where colon is used. This is folly. else if (scanner.lastIdentHash == INKEYS_KEY) { scanner.colonValSep = true; // needs reset at EOL } // The value separators used anywhere else are comma and vertical bar, // not so for parameter "regexten" where ampersand is used. Folly again. else if (scanner.lastIdentHash == REGEXTEN_KEY) { scanner.ampValSep = true; // needs reset at EOL } // end if return 1; } // end get_ident // --------------------------------------------------------------------------- // private function: get_plain_str() // --------------------------------------------------------------------------- // // pre-conditions: // // the current character must be the character immediately before this symbol // // post-conditions: // static int get_plain_str() { register CARDINAL index = 0; register CHAR ch; ch = readchar(); // copy to lexeme buffer scanner.lexbuf[index] = ch; index++; // take a peek at the next character ch = nextchar(); // read all characters until end of line or end of file is reached // or the maximum value length is reached or the lexeme buffer is full while ((IS_7BIT_ASCII(ch)) && (ch != EOL) && (NOT_EOF) && (ch != COMMA) && (ch != VERTICAL_BAR) && (ch != OPENING_PAREN) && (ch != SEMICOLON) && (index < MAX_PLAIN_VALUE_LEN)) { // deal with folly syntax exceptions for inkeys and regexten if ((scanner.ampValSep == true) && (ch == AMPERSAND)) { break; } else if ((scanner.colonValSep == true) && (ch == COLON)) { break; } // end if // read the current character ch = readchar(); // copy it to lexeme buffer unless it is a control character other than tab if ((IS_NOT_CONTROL(ch)) || (ch == TAB)) { scanner.lexbuf[index] = ch; index++; } // end if // take a peek at the next character ch = nextchar(); } // end while // skip any characters until end of line or end of file is reached if (index == MAX_PLAIN_VALUE_LEN) { skip_remainder_of_line(); } // end if // remove any trailing whitespace and tabs while ((index > 0) && (IS_WHITESPACE_OR_TAB(scanner.lexbuf[index-1]))) { index--; } // end while // if we are not at the end of the lexeme buffer if (index < LEX_BUFFER_CAPACITY) { // terminate the string in the buffer scanner.lexbuf[index] = CSTRING_TERMINATOR; } // end if return 1; } // end get_plain_str // --------------------------------------------------------------------------- // private function: get_utf8_multi_byte_sequence(seq) // --------------------------------------------------------------------------- // // pre-conditions: // // the current character must be the lead byte of a UTF8 sequence // // post-conditions: // // seq contains the UTF8 sequence, seq.length contains its length // // the file pointer is moved to the first character after the UTF8 sequence // // return value is 1 if the UTF8 sequence is valid, otherwise 0. // typedef struct { CARDINAL length; CHAR byte[4]; } UTF8Sequence; static int get_utf8_multi_byte_sequence(UTF8Sequence *seq) { register CARDINAL expected_length, index = 1; register CHAR lead_byte, ch; // read the lead byte lead_byte = readchar(); seq->byte[0] = lead_byte; seq->length = 1; // determine the length expected_length = UTF8_LENGTH(lead_byte); // take a peek at the next character ch = nextchar(); // if there are remaining bytes if (expected_length > 1) { // validate 2nd byte in UTF8 sequence if (IS_LEGAL_UTF8_2ND_BYTE(lead_byte, ch)) { // it's valid, copy it to seq ch = readchar(); seq->byte[1] = ch; seq->length++; index++; ch = nextchar(); } // invalid 2nd byte in UTF8 sequence else { // skip the remaining bytes while ((IS_NOT_7BIT_ASCII(ch)) && (index < (expected_length + 1))) { ch = readchar(); index++; } // end while } // end if } // end if // if there are further remaining bytes while ((IS_NOT_7BIT_ASCII(ch)) && (index < (expected_length + 1))) { ch = readchar(); // validate tail byte in UTF8 sequence if (IS_LEGAL_UTF8_TAIL_BYTE(ch)) { // it's valid, copy it to seq seq->byte[index] = ch; seq->length++; index++; } // exit if illegal tail byte is found else { break; } // end if // take a peek at the next character ch = nextchar(); } // end while if (seq->length == expected_length) { // UTF8 sequence is valid return 1; } else { // UTF8 sequence is invalid return 0; } // end if } // end get_utf8_multi_byte_sequence // --------------------------------------------------------------------------- // private function: get_quoted_utf8_str() // --------------------------------------------------------------------------- // // pre-conditions: // // the current character must be the quotation mark starting this symbol // // post-conditions: // static int get_quoted_utf8_str() { UTF8Sequence utf8_sequence; CARDINAL source_index; register CARDINAL index = 0; register CHAR ch; // get the current character ch = nextchar(); // read the remainder of the value while ((ch != DOUBLE_QUOTE) && (ch != EOL) && (NOT_EOF)) { // read the current character ch = readchar(); // check for UTF8 multi-byte sequence if (IS_NOT_7BIT_ASCII(ch)) { // validate and copy UTF8 sequence to lexeme buffer // but only if the entire sequence fits into the buffer if (get_utf8_multi_byte_sequence(&utf8_sequence) == 1) { // UTF8 sequence is valid if ((index + utf8_sequence.length) < MAX_QUOTED_VALUE_LEN) { // UTF8 sequence fits into the lexeme buffer, copy it for (source_index = 0; source_index < utf8_sequence.length; source_index++) { scanner.lexbuf[index] = utf8_sequence.byte[source_index]; } // end for } // UTF8 sequence is valid but doesn't fit into the lexeme buffer else { // update flag to indicate excess characters were ignored scanner.excess_chars_ignored = true; } // end if } // UTF8 sequence is invalid else { // update flag to indicate illegal characters were skipped scanner.illegal_chars_skipped = true; } // end if } // check for non-control 7-bit ASCII codes else if (IS_NOT_CONTROL(ch) || (ch == TAB)) { // check if this is a backslash escaped double quote if ((ch == BACKSLASH) && (nextchar() == DOUBLE_QUOTE)) { // skip the backslash and fetch the double quote ch = readchar(); } // end if // copy to lexeme buffer unless buffer is full if (index < MAX_QUOTED_VALUE_LEN) { scanner.lexbuf[index] = ch; index++; } // otherwise, buffer is full else { // update flag to indicate excess characters were ignored scanner.excess_chars_ignored = true; } // end if } // otherwise, this is an illegal control character else { // update flag to indicate illegal characters were skipped scanner.illegal_chars_skipped = true; } // end if // take a peek at the next character ch = nextchar(); } // end while // terminate the string in the buffer scanner.lexbuf[index] = CSTRING_TERMINATOR; // check if properly delimited if (ch == DOUBLE_QUOTE) { // read the last character ch = readchar(); } // otherwise, it is EOL or EOF terminated else { scanner.auto_terminated = true; } // end while return 1; } // end get_quoted_utf8_str // --------------------------------------------------------------------------- // private function: get_bracketed_str() // --------------------------------------------------------------------------- // // pre-conditions: // // the current character must be the character immediately before this symbol // // post-conditions: // static int get_bracketed_str() { register CARDINAL index = 0; register CHAR ch; ch = readchar(); // copy to lexeme buffer scanner.lexbuf[index] = ch; index++; // take a peek at the next character ch = nextchar(); // skip any leading whitespace and tabs ch = skip_whitespace_and_tabs(); // first character after opening bracket may be plus sign if (ch == PLUS) { ch = readchar(); // copy to lexeme buffer scanner.lexbuf[index] = ch; index++; // take a peek at the next character ch = nextchar(); } // end if // read the remainder of the value while (((IS_DIGIT(ch)) || (ch == WHITESPACE) || (ch == OPENING_PAREN) || (ch == CLOSING_PAREN) || (ch == HYPHEN)) && (index < MAX_BRACKETED_VALUE_LEN)) { // read the current character ch = readchar(); // copy it to lexeme buffer unless it is a control character scanner.lexbuf[index] = ch; index++; // take a peek at the next character ch = nextchar(); } // end while // skip any characters until closing angular bracket, end of line or end of file is reached if (index == MAX_BRACKETED_VALUE_LEN) { while ((ch != CLOSING_ANGULAR_BRACKET) && (ch != EOL) && (NOT_EOF)) { ch = readchar(); ch = nextchar(); } // end while } // end if // check if properly delimited if (ch == CLOSING_ANGULAR_BRACKET) { // remove any trailing whitespace while ((index > 0) && (ch == WHITESPACE)) { index--; } // end while // read the last character ch = readchar(); // copy it to lexeme buffer scanner.lexbuf[index] = ch; index++; // if we are not at the end of the lexeme buffer if (index < LEX_BUFFER_CAPACITY) { // terminate the string in the buffer scanner.lexbuf[index] = CSTRING_TERMINATOR; } // end if } // otherwise, report error else { // illegal character // return with failure return 0; } // end if return 1; } // end get_bracketed_str // --------------------------------------------------------------------------- // private function: get_arglist_str() // --------------------------------------------------------------------------- // // pre-conditions: // // the current character must be the character immediately before this symbol // // post-conditions: // static int get_arglist_str() { register CARDINAL paren_level = 1; register CARDINAL index = 0; register CHAR ch; ch = readchar(); // copy to lexeme buffer scanner.lexbuf[index] = ch; index++; // take a peek at the next character ch = nextchar(); // skip any leading whitespace and tabs ch = skip_whitespace_and_tabs(); // read the remainder of the value while ((paren_level > 0) && (ch != EOL) && (NOT_EOF) && (index < MAX_ARGUMENT_LIST_LEN)) { // read the current character ch = readchar(); // copy it to lexeme buffer scanner.lexbuf[index] = ch; index++; // keep track of nested parenthesises if (ch == OPENING_PAREN) { paren_level++; } else if (ch == CLOSING_PAREN) { paren_level--; // if this is the final closing parenthesis ... if (paren_level == 0) { // ... remove any trailing whitespace or tabs while ((index > 0) && (IS_WHITESPACE_OR_TAB(ch))) { index--; } // end while } // end if } // end if // take a peek at the next character ch = nextchar(); } // end while // if we exceeded maximum length ... if (index == MAX_ARGUMENT_LIST_LEN) { // ... skip any characters until // all open parenthesises are closed or end of line or end of file is reached while ((paren_level > 0) && (ch != EOL) && (NOT_EOF)) { ch = readchar(); // keep track of nested parenthesises if (ch == OPENING_PAREN) { paren_level++; } else if (ch == CLOSING_PAREN) { paren_level--; } // end if ch = nextchar(); } // end while } // end if // if we are not at the end of the lexeme buffer if (index < LEX_BUFFER_CAPACITY) { // terminate the string in the buffer scanner.lexbuf[index] = CSTRING_TERMINATOR; } // end if // check if properly delimited if (paren_level != 0) { // TO DO: error or warning } // end if return 1; } // end get_arglist_str // --------------------------------------------------------------------------- // private function: get_encoding_pfx() // --------------------------------------------------------------------------- // // pre-conditions: // // the current character must be the back quote starting this symbol // // post-conditions: // static int get_encoding_pfx() { register CARDINAL index = 0; register CHAR ch; // skip the back quote ch = readchar(); ch = nextchar(); // skip any leading whitespace and tabs ch = skip_whitespace_and_tabs(); // read the first significant character if (IS_LETTER(ch)) { ch = readchar(); // copy to lexeme buffer scanner.lexbuf[index] = ch; index++; // take a peek at the next character ch = nextchar(); } else { // illegal character // TO DO // return with failure return 0; } // end if // read the remainder of the identifier while (((IS_ALPHANUM(ch)) || (ch == UNDERSCORE) || (ch == HYPHEN)) && (index < MAX_IDENT_LEN)) { // read the current character ch = readchar(); // copy it to lexeme buffer scanner.lexbuf[index] = ch; index++; // take a peek at the next character ch = nextchar(); } // end while // terminate the string in the lexeme buffer scanner.lexbuf[index] = CSTRING_TERMINATOR; return 1; } // end get_encoding_pfx // =========================================================================== // P U B L I C F U N C T I O N S // =========================================================================== // --------------------------------------------------------------------------- // function: init_scanner(infile) // --------------------------------------------------------------------------- // // Initialises the scanner and opens the sourcefile for reading. ScannerStatus init_scanner(const char *infile) { int error = 0; // do nothing if already initialised if (scanner.initialized == true) { return SCANNER_STATUS_ALREADY_INITIALIZED; } // end if // open the source file scanner.sourcefile = fopen(infile, "r"); // if unsuccessful, get POSIX error code and set status if (scanner.sourcefile == NULL) { error = errno; if ((error == ENOENT) || (error == ENOTDIR)) { return SCANNER_STATUS_FILE_NOT_FOUND; } else if (error == EACCES) { return SCANNER_STATUS_FILE_ACCESS_DENIED; } else if ((error == EMFILE) || (error == ENFILE)) { return SCANNER_STATUS_OPEN_FILE_LIMIT_REACHED; } else if (error == ENAMETOOLONG) { return SCANNER_STATUS_PATH_NAME_TOO_LONG; } else if (error == ENOMEM) { return SCANNER_STATUS_OUT_OF_MEMORY; } else if (error == ELOOP) { return SCANNER_STATUS_LOOP_IN_PATHNAME; } else { return SCANNER_STATUS_ERROR_OPENING_FILE; } // end if } // end if // allocate memory for lexeme buffer scanner.lexbuf = (CHAR *) malloc(LEX_BUFFER_CAPACITY); // if successful ... if (scanner.lexbuf != NULL) { // ... initialise with empty string scanner.lexbuf[0] = CSTRING_TERMINATOR; } // otherwise ... else { error = errno; if (error == ENOMEM) { fclose(scanner.sourcefile); return SCANNER_STATUS_OUT_OF_MEMORY; } // end if } // end if // initialise flags and counters scanner.end_of_file = false; scanner.lefthandSide = true; scanner.allowUTF8 = true; scanner.ampValSep = false; scanner.colonValSep = false; scanner.allowBracketValue = false; scanner.filename = (CHAR *) infile; SET_POSITION(scanner.currentPos, 0, 1); scanner.lastSym = NO_TOKEN; scanner.lastIdentHash = 0; SET_POSITION(scanner.lastSymPos, 0, 1); scanner.lastStatus = SCANNER_STATUS_SUCCESS; scanner.illegal_chars_skipped = false; scanner.excess_chars_ignored = false; scanner.auto_terminated = false; scanner.initialized = true; return SCANNER_STATUS_SUCCESS; } // end init_scanner // --------------------------------------------------------------------------- // function: scanner_initialized() // --------------------------------------------------------------------------- // // Returns the intialisation status of the scanner, true or false. bool scanner_initialized() { return (scanner.initialized = true); } // end scanner_initialized // --------------------------------------------------------------------------- // function: getsym() // --------------------------------------------------------------------------- // // Reads the next symbol from sourcefile and returns its token. ScannerToken getsym() { CHAR ch; ScannerToken token; // exit if not initialised if (scanner.initialized == false) { scanner.lastStatus = SCANNER_STATUS_NOT_INITIALIZED; return NO_TOKEN; } // end if // take a peek at the next character ch = nextchar(); // skip whitespace and tabs ch = skip_whitespace_and_tabs(); // remember the position scanner.lastSymPos = scanner.currentPos; // reset indicators scanner.illegal_chars_skipped = false; scanner.excess_chars_ignored = false; scanner.auto_terminated = false; // process lefthand side symbols if (scanner.lefthandSide == true) { // first significant character on current line if (ch == SEMICOLON) { // found comment delimiter get_comment(); token = COMMENT; } else if ((IS_LETTER(ch)) || (ch == UNDERSCORE)) { // found identifier get_ident(); token = IDENTIFIER; } else if (ch == EQUAL_SIGN) { // found assignment operator ch = readchar(); copy_to_lexbuf("="); // look at next character ch = nextchar(); if (ch == CLOSING_ANGULAR_BRACKET) { // read and skip it ch = readchar(); append_to_lexbuf(">"); } // end if // anything that follows is right hand side scanner.lefthandSide = false; token = ASSIGN_OPERATOR; } else if (ch == OPENING_BRACKET) { // found start of section header ch = readchar(); copy_to_lexbuf("["); // look at next character ch = nextchar(); if (ch == PLUS) { // read and skip it ch = readchar(); append_to_lexbuf("+"); token = START_OF_VFILE_HEADER; } else { token = START_OF_SECTION_HEADER; } // end if } else if (ch == CLOSING_BRACKET) { // found end of section header ch = readchar(); copy_to_lexbuf("]"); token = END_OF_HEADER; } else if (ch == EOL) { // found end of line ch = readchar(); copy_to_lexbuf("\n"); scanner.ampValSep = false; scanner.colonValSep = false; scanner.allowBracketValue = false; token = EOL_MARK; } else if (EOF_REACHED) { // found end of file ch = readchar(); scanner.lexbuf[0] = CSTRING_TERMINATOR; token = EOF_MARK; } else { // illegal character ch = readchar(); scanner.lexbuf[0] = ch; scanner.lexbuf[1] = CSTRING_TERMINATOR; scanner.lastStatus = SCANNER_STATUS_ILLEGAL_CHARACTER; token = NO_TOKEN; } // end if } // process right hand side symbols else { if (ch == DOUBLE_QUOTE) { // quoted string value ch = readchar(); get_quoted_utf8_str(); token = QUOTED_VALUE; } else if (ch == OPENING_PAREN) { // argument list get_arglist_str(); token = ARGUMENT_LIST; } else if ((scanner.allowBracketValue == true) && (ch == OPENING_ANGULAR_BRACKET)) { // quoted string value get_bracketed_str(); token = BRACKETED_VALUE; } else if (ch == BACK_QUOTE) { // encoding prefix ch = readchar(); get_encoding_pfx(); token = ENCODING_PREFIX; } else if (ch == COMMA) { // value delimiter ch = readchar(); copy_to_lexbuf(","); token = VALUE_SEPARATOR; } else if (ch == VERTICAL_BAR) { // value delimiter ch = readchar(); copy_to_lexbuf("|"); token = VALUE_SEPARATOR; } else if (ch == SEMICOLON) { // found comment delimiter ch = readchar(); get_comment(); token = COMMENT; } else if ((scanner.ampValSep == true) && (ch == AMPERSAND)) { // found ampersand, treat as value separator ch = readchar(); copy_to_lexbuf("&"); token = VALUE_SEPARATOR; } else if ((scanner.colonValSep == true) && (ch == COLON)) { // found colon, treat as value separator ch = readchar(); copy_to_lexbuf(":"); token = VALUE_SEPARATOR; } else if (ch == EOL) { // found end of line ch = readchar(); copy_to_lexbuf("\n"); scanner.ampValSep = false; scanner.colonValSep = false; scanner.allowBracketValue = false; token = EOL_MARK; } else if (EOF_REACHED) { // found end of file ch = readchar(); scanner.lexbuf[0] = CSTRING_TERMINATOR; token = EOF_MARK; } else if (IS_CONTROL(ch)) { // illegal character ch = readchar(); scanner.lexbuf[0] = ch; scanner.lexbuf[1] = CSTRING_TERMINATOR; scanner.lastStatus = SCANNER_STATUS_ILLEGAL_CHARACTER; token = NO_TOKEN; } else { // plain value get_plain_str(); token = PLAIN_VALUE; } // end if } // end if if (token != NO_TOKEN) { scanner.lastSym = token; scanner.lastStatus = SCANNER_STATUS_SUCCESS; } // end if return token; } // end getsym // --------------------------------------------------------------------------- // function: lookahead_sym() // --------------------------------------------------------------------------- // // Looks ahead one symbol in the sourcefile and returns its token. Subsequent // calls to this function will return the same symbol. Subsequent calls to // getpos() and getlex() will return the respective values for the current // symbol and not for the lookahead symbol. ScannerToken lookahead_sym() { ScannerToken sym; // Scanner saved_state; // TO DO sym = getsym(); return sym; } // end lookahead_sym // --------------------------------------------------------------------------- // function: getpos() // --------------------------------------------------------------------------- // // Returns the most previously read symbol's position in the sourcefile. ScannerPosition getpos() { if (scanner.initialized == false) { ScannerPosition nullpos; SET_POSITION(nullpos, 0, 0); return nullpos; } // end if return scanner.lastSymPos; } // end getpos // --------------------------------------------------------------------------- // function: get_filename() // --------------------------------------------------------------------------- // // Returns the name of the most previously read symbol's sourcefile. const char *get_filename() { return (char *) scanner.filename; } // end getpos // --------------------------------------------------------------------------- // function: gethash() // --------------------------------------------------------------------------- // // Returns the hash value of the most previously read symbol if the symbol is // an identifier, otherwise zero. // unsigned int gethash() { // exit if not initialised if ((scanner.initialized == false) || (scanner.lastSym != IDENTIFIER)) { return 0; } // end if return scanner.lastIdentHash; } // end gethash // --------------------------------------------------------------------------- // function: getlex(*str) // --------------------------------------------------------------------------- // // Passes the most previously read symbol's lexical representation (lexeme) // in str. const char *getlex() { // exit if not initialised if (scanner.initialized == false) { return NULL; } // end if return (char *) scanner.lexbuf; } // end getlex // --------------------------------------------------------------------------- // function: getstat() // --------------------------------------------------------------------------- // // Returns the status of the most recent operation. ScannerStatus getstat() { if (scanner.initialized == false) { return SCANNER_STATUS_NOT_INITIALIZED; } else { return scanner.lastStatus; } // end if } // end getstat // --------------------------------------------------------------------------- // function: illegal_characters_skipped() // --------------------------------------------------------------------------- // // Returns true if illegal characters were skipped while last symbol was read, // otherwise false. bool illegal_characters_skipped() { return scanner.illegal_chars_skipped; } // end illegal_characters_skipped // --------------------------------------------------------------------------- // function: excess_characters_ignored() // --------------------------------------------------------------------------- // // Returns true if excess characters were ignored while last symbol was read, // otherwise false. bool excess_characters_ignored() { return scanner.excess_chars_ignored; } // end excess_characters_ignored // --------------------------------------------------------------------------- // function: auto_terminated() // --------------------------------------------------------------------------- // // Returns true if a closing delimiter was missing while last symbol was read, // otherwise false. bool auto_terminated() { return scanner.auto_terminated; } // end auto_terminated // --------------------------------------------------------------------------- // function: max_len_of(sym) // --------------------------------------------------------------------------- // // Returns the maximum allowable length for symbol sym. unsigned int max_len_of(const ScannerToken sym) { switch(sym) { case COMMENT: return MAX_COMMENT_LEN; case START_OF_SECTION_HEADER : return 1; case START_OF_VFILE_HEADER : return 2; case END_OF_HEADER : return 1; case IDENTIFIER : return MAX_IDENT_LEN; case ASSIGN_OPERATOR: return 2; case PLAIN_VALUE : return MAX_PLAIN_VALUE_LEN; case ENCODING_PREFIX : return MAX_IDENT_LEN; case QUOTED_VALUE : return MAX_QUOTED_VALUE_LEN; case BRACKETED_VALUE : return MAX_BRACKETED_VALUE_LEN; case VALUE_SEPARATOR: return 1; case ARGUMENT_LIST: return MAX_ARGUMENT_LIST_LEN; case EOL_MARK : return 1; case EOF_MARK : return 0; default : return 0; } // end switch } // end max_symlen // --------------------------------------------------------------------------- // function: size_of_lexbuf() // --------------------------------------------------------------------------- // // Returns the size of the lexeme buffer. unsigned int size_of_lexbuf() { return LEX_BUFFER_CAPACITY; } // end size_of_lexbuf // --------------------------------------------------------------------------- // function: token_name(sym) // --------------------------------------------------------------------------- // // Returns a human readable name for token sym. const char *token_name(const ScannerToken sym) { if ((sym >= NO_TOKEN) && (sym <= EOF_MARK)) { return (char *) ScannerTokenName[sym]; } else { return (char *) ScannerTokenName[NO_TOKEN]; } // end if } // end token_name // --------------------------------------------------------------------------- // function: reset_scanner() // --------------------------------------------------------------------------- // // Resets the scanner to pre-initialisation status and closes the sourcefile. ScannerStatus reset_scanner() { // exit if not initialised if (scanner.initialized == false) { return SCANNER_STATUS_NOT_INITIALIZED; } // end if // reset initialisation status scanner.initialized = false; // release lexeme buffer free(scanner.lexbuf); scanner.lexbuf = NULL; // close open include files while (scanner.includeLevel > 0) { close_include_file(); } // end while // close toplevel source file fclose(scanner.sourcefile); scanner.sourcefile = NULL; // free(scanner.filename); scanner.filename = NULL; // reset flags and counters scanner.end_of_file = false; scanner.lefthandSide = true; scanner.allowUTF8 = true; scanner.ampValSep = false; scanner.allowBracketValue = false; scanner.colonValSep = false; scanner.includeLevel = 0; SET_POSITION(scanner.currentPos, 0, 1); scanner.lastSym = NO_TOKEN; scanner.lastIdentHash = 0; SET_POSITION(scanner.lastSymPos, 0, 1); scanner.lastStatus = SCANNER_STATUS_SUCCESS; scanner.illegal_chars_skipped = false; scanner.excess_chars_ignored = false; scanner.auto_terminated = false; return scanner.lastStatus; } // end reset_scanner // END OF FILE