diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/Components/OptionsArray.php | 4 | ||||
-rw-r--r-- | src/Context.php | 4 | ||||
-rw-r--r-- | src/Lexer.php | 32 | ||||
-rw-r--r-- | src/UtfString.php | 43 |
4 files changed, 40 insertions, 43 deletions
diff --git a/src/Components/OptionsArray.php b/src/Components/OptionsArray.php index a6b612d..487eefd 100644 --- a/src/Components/OptionsArray.php +++ b/src/Components/OptionsArray.php @@ -201,8 +201,8 @@ class OptionsArray extends Component $list, empty($lastOption[2]) ? array() : $lastOption[2] ); - $ret->options[$lastOptionId]['value'] = - $ret->options[$lastOptionId]['expr']->expr; + $ret->options[$lastOptionId]['value'] + = $ret->options[$lastOptionId]['expr']->expr; $lastOption = null; $state = 0; } else { diff --git a/src/Context.php b/src/Context.php index efcb8df..dd912fb 100644 --- a/src/Context.php +++ b/src/Context.php @@ -397,7 +397,9 @@ abstract class Context */ public static function isSeparator($str) { - return !ctype_alnum($str) && $str !== '_'; + // NOTES: Only ASCII characters may be separators. + // `~` is the last printable ASCII character. + return ($str <= '~') && (!ctype_alnum($str)) && ($str !== '_'); } /** diff --git a/src/Lexer.php b/src/Lexer.php index d70641f..6e8c0fd 100644 --- a/src/Lexer.php +++ b/src/Lexer.php @@ -13,6 +13,16 @@ namespace SqlParser; use SqlParser\Exceptions\LexerException; +if (!defined('USE_UTF_STRINGS')) { + + /** + * Forces usage of `UtfString` if the string is multibyte. + * `UtfString` may be slower, but it gives better results. + * @var bool + */ + define('USE_UTF_STRINGS', true); +} + /** * Performs lexical analysis over a SQL statement and splits it in multiple * tokens. @@ -149,11 +159,27 @@ class Lexer */ public function __construct($str, $strict = false) { - $this->str = $str; - $this->len = ($str instanceof UtfString) ? - $str->length() : strlen($str); + // For multi-byte strings, a new instance of `UtfString` is + // initialized (only if `UtfString` usage is forced. 
+ if (!($str instanceof UtfString)) { + $len = strlen($str); + if ((USE_UTF_STRINGS) && ($len != mb_strlen($str))) { + $str = new UtfString($str); + } + } + + if ($str instanceof UtfString) { + $this->str = $str; + $this->len = $str->length(); + } else { + $this->str = $str; + // `strlen` is used instead of `mb_strlen` because the lexer + // needs to parse each byte of the input. + $this->len = $len; + } $this->strict = $strict; + // Setting the delimiter. $this->delimiter = static::$DEFAULT_DELIMITER; $this->lex(); diff --git a/src/UtfString.php b/src/UtfString.php index 27e3f93..1e863a4 100644 --- a/src/UtfString.php +++ b/src/UtfString.php @@ -93,7 +93,7 @@ class UtfString implements \ArrayAccess */ public function offsetExists($offset) { - return $offset < $this->charLen; + return ($offset >= 0) && ($offset < $this->charLen); } /** @@ -190,26 +190,13 @@ class UtfString implements \ArrayAccess return 3; } elseif ($byte < 248) { return 4; - } elseif ($byte === 252) { + } elseif ($byte < 252) { return 5; // unofficial } return 6; // unofficial } /** - * Returns the number of remaining characters. - * - * @return int - */ - public function remaining() - { - if ($this->charIdx < $this->charLen) { - return $this->charLen - $this->charIdx; - } - return 0; - } - - /** * Returns the length in characters of the string. * * @return int @@ -220,30 +207,12 @@ class UtfString implements \ArrayAccess } /** - * Gets the values of the indexes. - * - * @param int &$byte Reference to the byte index. - * @param int &$char Reference to the character index. - * - * @return void - */ - public function getIndexes(&$byte, &$char) - { - $byte = $this->byteIdx; - $char = $this->charIdx; - } - - /** - * Sets the values of the indexes. + * Returns the contained string. * - * @param int $byte The byte index. - * @param int $char The character index. 
- * - * @return void + * @return string */ - public function setIndexes($byte = 0, $char = 0) + public function __toString() { - $this->byteIdx = $byte; - $this->charIdx = $char; + return $this->str; } } |