summaryrefslogtreecommitdiffstats
path: root/src/Lexer.php
diff options
context:
space:
mode:
Diffstat (limited to 'src/Lexer.php')
-rw-r--r--src/Lexer.php32
1 files changed, 29 insertions, 3 deletions
diff --git a/src/Lexer.php b/src/Lexer.php
index d70641f..6e8c0fd 100644
--- a/src/Lexer.php
+++ b/src/Lexer.php
@@ -13,6 +13,16 @@ namespace SqlParser;
use SqlParser\Exceptions\LexerException;
+if (!defined('USE_UTF_STRINGS')) {
+
+ /**
+ * Forces usage of `UtfString` if the string is multibyte.
+ * `UtfString` may be slower, but it gives better results.
+ * @var bool
+ */
+ define('USE_UTF_STRINGS', true);
+}
+
/**
* Performs lexical analysis over a SQL statement and splits it in multiple
* tokens.
@@ -149,11 +159,27 @@ class Lexer
*/
public function __construct($str, $strict = false)
{
- $this->str = $str;
- $this->len = ($str instanceof UtfString) ?
- $str->length() : strlen($str);
+ // For multi-byte strings, a new instance of `UtfString` is
+ // initialized (only if `UtfString` usage is forced).
+ if (!($str instanceof UtfString)) {
+ $len = strlen($str);
+ if ((USE_UTF_STRINGS) && ($len != mb_strlen($str))) {
+ $str = new UtfString($str);
+ }
+ }
+
+ if ($str instanceof UtfString) {
+ $this->str = $str;
+ $this->len = $str->length();
+ } else {
+ $this->str = $str;
+ // `strlen` is used instead of `mb_strlen` because the lexer
+ // needs to parse each byte of the input.
+ $this->len = $len;
+ }
$this->strict = $strict;
+ // Setting the delimiter.
$this->delimiter = static::$DEFAULT_DELIMITER;
$this->lex();