Fixed support for multi-byte strings.

author: Dan Ungureanu <udan1107@gmail.com> 2015-07-16 16:45:04 +0300
committer: Dan Ungureanu <udan1107@gmail.com> 2015-07-16 16:45:04 +0300
commit: 9e3a9ee729eaada8585fa8f33ba7d8ac2231495a (patch)
tree: 430342d6b6d83dc8d89e5f55d6875451f687f0ee
parent: 56ce2d7a37a1ed1d6aabf2b7908ae5da43863497 (diff)
download: sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.zip
sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.tar.gz
sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.tar.bz2
9 files changed, 134 insertions, 43 deletions
diff --git a/phpunit.xml b/phpunit.xml
index ea82742..5d34349 100644
--- a/phpunit.xml
+++ b/phpunit.xml
@@ -22,6 +22,9 @@
         <testsuite name="Lexer">
             <directory suffix=".php">./tests/Lexer</directory>
         </testsuite>
+        <testsuite name="Misc">
+            <directory suffix=".php">./tests/Misc</directory>
+        </testsuite>
         <testsuite name="Parser">
             <directory suffix=".php">./tests/Parser</directory>
         </testsuite>
diff --git a/src/Components/OptionsArray.php b/src/Components/OptionsArray.php
index a6b612d..487eefd 100644
--- a/src/Components/OptionsArray.php
+++ b/src/Components/OptionsArray.php
@@ -201,8 +201,8 @@ class OptionsArray extends Component
                         $list,
                         empty($lastOption[2]) ? array() : $lastOption[2]
                     );
-                    $ret->options[$lastOptionId]['value'] =
-                        $ret->options[$lastOptionId]['expr']->expr;
+                    $ret->options[$lastOptionId]['value']
+                        = $ret->options[$lastOptionId]['expr']->expr;
                     $lastOption = null;
                     $state = 0;
                 } else {
diff --git a/src/Context.php b/src/Context.php
index efcb8df..dd912fb 100644
--- a/src/Context.php
+++ b/src/Context.php
@@ -397,7 +397,9 @@ abstract class Context
      */
     public static function isSeparator($str)
     {
-        return !ctype_alnum($str) && $str !== '_';
+        // NOTES:   Only ASCII characters may be separators.
+        //          `~` is the last printable ASCII character.
+        return ($str <= '~') && (!ctype_alnum($str)) && ($str !== '_');
     }
 
     /**
diff --git a/src/Lexer.php b/src/Lexer.php
index d70641f..6e8c0fd 100644
--- a/src/Lexer.php
+++ b/src/Lexer.php
@@ -13,6 +13,16 @@ namespace SqlParser;
 
 use SqlParser\Exceptions\LexerException;
 
+if (!defined('USE_UTF_STRINGS')) {
+
+    /**
+     * Forces usage of `UtfString` if the string is multibyte.
+     * `UtfString` may be slower, but it gives better results.
+     * @var bool
+     */
+    define('USE_UTF_STRINGS', true);
+}
+
 /**
  * Performs lexical analysis over a SQL statement and splits it in multiple
  * tokens.
@@ -149,11 +159,27 @@ class Lexer
      */
     public function __construct($str, $strict = false)
     {
-        $this->str = $str;
-        $this->len = ($str instanceof UtfString) ?
-            $str->length() : strlen($str);
+        // For multi-byte strings, a new instance of `UtfString` is
+        // initialized (only if `UtfString` usage is forced).
+        if (!($str instanceof UtfString)) {
+            $len = strlen($str);
+            if ((USE_UTF_STRINGS) && ($len != mb_strlen($str))) {
+                $str = new UtfString($str);
+            }
+        }
+
+        if ($str instanceof UtfString) {
+            $this->str = $str;
+            $this->len = $str->length();
+        } else {
+            $this->str = $str;
+            // `strlen` is used instead of `mb_strlen` because the lexer
+            // needs to parse each byte of the input.
+            $this->len = $len;
+        }
         $this->strict = $strict;
 
+        // Setting the delimiter.
         $this->delimiter = static::$DEFAULT_DELIMITER;
 
         $this->lex();
diff --git a/src/UtfString.php b/src/UtfString.php
index 27e3f93..1e863a4 100644
--- a/src/UtfString.php
+++ b/src/UtfString.php
@@ -93,7 +93,7 @@ class UtfString implements \ArrayAccess
      */
     public function offsetExists($offset)
     {
-        return $offset < $this->charLen;
+        return ($offset >= 0) && ($offset < $this->charLen);
     }
 
     /**
@@ -190,26 +190,13 @@ class UtfString implements \ArrayAccess
             return 3;
         } elseif ($byte < 248) {
             return 4;
-        } elseif ($byte === 252) {
+        } elseif ($byte < 252) {
             return 5; // unofficial
         }
         return 6; // unofficial
     }
 
     /**
-     * Returns the number of remaining characters.
-     *
-     * @return int
-     */
-    public function remaining()
-    {
-        if ($this->charIdx < $this->charLen) {
-            return $this->charLen - $this->charIdx;
-        }
-        return 0;
-    }
-
-    /**
      * Returns the length in characters of the string.
      *
      * @return int
@@ -220,30 +207,12 @@ class UtfString implements \ArrayAccess
     }
 
     /**
-     * Gets the values of the indexes.
-     *
-     * @param int &$byte Reference to the byte index.
-     * @param int &$char Reference to the character index.
-     *
-     * @return void
-     */
-    public function getIndexes(&$byte, &$char)
-    {
-        $byte = $this->byteIdx;
-        $char = $this->charIdx;
-    }
-
-    /**
-     * Sets the values of the indexes.
+     * Returns the contained string.
      *
-     * @param int $byte The byte index.
-     * @param int $char The character index.
-     *
-     * @return void
+     * @return string
      */
-    public function setIndexes($byte = 0, $char = 0)
+    public function __toString()
     {
-        $this->byteIdx = $byte;
-        $this->charIdx = $char;
+        return $this->str;
     }
 }
diff --git a/tests/Lexer/LexerTest.php b/tests/Lexer/LexerTest.php
index dabe64d..cd8718c 100644
--- a/tests/Lexer/LexerTest.php
+++ b/tests/Lexer/LexerTest.php
@@ -51,6 +51,7 @@ class LexerTest extends TestCase
     {
         return array(
             array('lex'),
+            array('lexUtf8'),
             array('lexBool'),
             array('lexComment'),
             array('lexDelimiter'),
diff --git a/tests/Misc/UtfStringTest.php b/tests/Misc/UtfStringTest.php
new file mode 100644
index 0000000..0d1ff78
--- /dev/null
+++ b/tests/Misc/UtfStringTest.php
@@ -0,0 +1,88 @@
+<?php
+
+namespace SqlParser\Tests\Misc;
+
+use SqlParser\UtfString;
+
+use SqlParser\Tests\TestCase;
+
+class UtfStringTest extends TestCase
+{
+
+    /**
+     * Sample phrase in French.
+     *
+     * @var string
+     */
+    const TEST_PHRASE = 'Les naïfs ægithales hâtifs pondant à Noël où il ' .
+        'gèle sont sûrs d\'être déçus en voyant leurs drôles d\'œufs abîmés.';
+
+    /**
+     * The length of the sample phrase.
+     *
+     * @var int
+     */
+    const TEST_PHRASE_LEN = 113;
+
+    public function testArrayAccess()
+    {
+        $str = new UtfString(static::TEST_PHRASE);
+
+        // offsetExists
+        $this->assertTrue(isset($str[static::TEST_PHRASE_LEN - 1]));
+        $this->assertFalse(isset($str[-1]));
+        $this->assertFalse(isset($str[static::TEST_PHRASE_LEN]));
+
+        // offsetGet
+        $this->assertEquals('.', $str[static::TEST_PHRASE_LEN - 1]);
+        $this->assertEquals(null, $str[-1]);
+        $this->assertEquals(null, $str[static::TEST_PHRASE_LEN]);
+    }
+
+    /**
+     * @expectedException \Exception
+     * @expectedExceptionMessage Not implemented.
+     */
+    public function testSet()
+    {
+        $str = new UtfString('');
+        $str[0] = 'a';
+    }
+
+    /**
+     * @expectedException \Exception
+     * @expectedExceptionMessage Not implemented.
+     */
+    public function testUnset()
+    {
+        $str = new UtfString('');
+        unset($str[0]);
+    }
+
+    public function testGetCharLength()
+    {
+        $this->assertEquals(1, UtfString::getCharLength(chr(0x00))); // 00000000
+        $this->assertEquals(1, UtfString::getCharLength(chr(0x7F))); // 01111111
+
+        $this->assertEquals(2, UtfString::getCharLength(chr(0xC0))); // 11000000
+        $this->assertEquals(2, UtfString::getCharLength(chr(0xDF))); // 11011111
+
+        $this->assertEquals(3, UtfString::getCharLength(chr(0xE0))); // 11100000
+        $this->assertEquals(3, UtfString::getCharLength(chr(0xEF))); // 11101111
+
+        $this->assertEquals(4, UtfString::getCharLength(chr(0xF0))); // 11110000
+        $this->assertEquals(4, UtfString::getCharLength(chr(0xF7))); // 11110111
+
+        $this->assertEquals(5, UtfString::getCharLength(chr(0xF8))); // 11111000
+        $this->assertEquals(5, UtfString::getCharLength(chr(0xFB))); // 11111011
+
+        $this->assertEquals(6, UtfString::getCharLength(chr(0xFC))); // 11111100
+        $this->assertEquals(6, UtfString::getCharLength(chr(0xFD))); // 11111101
+    }
+
+    public function testToString()
+    {
+        $str = new UtfString(static::TEST_PHRASE);
+        $this->assertEquals(static::TEST_PHRASE, (string) $str);
+    }
+}
diff --git a/tests/data/lexUtf8.in b/tests/data/lexUtf8.in
new file mode 100644
index 0000000..9cf478e
--- /dev/null
+++ b/tests/data/lexUtf8.in
@@ -0,0 +1 @@
+select * from école
+\ No newline at end of file
diff --git a/tests/data/lexUtf8.out b/tests/data/lexUtf8.out
new file mode 100644
index 0000000..a930264
--- /dev/null
+++ b/tests/data/lexUtf8.out
@@ -0,0 +1 @@
+a:2:{s:5:"lexer";O:15:"SqlParser\Lexer":8:{s:6:"strict";b:0;s:3:"str";O:19:"SqlParser\UtfString":5:{s:3:"str";s:20:"select * from école";s:7:"byteIdx";i:19;s:7:"charIdx";i:18;s:7:"byteLen";i:20;s:7:"charLen";i:19;}s:3:"len";i:19;s:4:"last";i:19;s:4:"list";O:20:"SqlParser\TokensList":3:{s:6:"tokens";a:8:{i:0;O:15:"SqlParser\Token":5:{s:5:"token";s:6:"select";s:5:"value";s:6:"SELECT";s:4:"type";i:1;s:5:"flags";i:3;s:8:"position";i:0;}i:1;O:15:"SqlParser\Token":5:{s:5:"token";s:1:" ";s:5:"value";s:1:" ";s:4:"type";i:3;s:5:"flags";i:0;s:8:"position";i:6;}i:2;O:15:"SqlParser\Token":5:{s:5:"token";s:1:"*";s:5:"value";s:1:"*";s:4:"type";i:2;s:5:"flags";i:1;s:8:"position";i:7;}i:3;O:15:"SqlParser\Token":5:{s:5:"token";s:1:" ";s:5:"value";s:1:" ";s:4:"type";i:3;s:5:"flags";i:0;s:8:"position";i:8;}i:4;O:15:"SqlParser\Token":5:{s:5:"token";s:4:"from";s:5:"value";s:4:"FROM";s:4:"type";i:1;s:5:"flags";i:3;s:8:"position";i:9;}i:5;O:15:"SqlParser\Token":5:{s:5:"token";s:1:" ";s:5:"value";s:1:" ";s:4:"type";i:3;s:5:"flags";i:0;s:8:"position";i:13;}i:6;O:15:"SqlParser\Token":5:{s:5:"token";s:6:"école";s:5:"value";s:6:"école";s:4:"type";i:0;s:5:"flags";i:0;s:8:"position";i:14;}i:7;O:15:"SqlParser\Token":5:{s:5:"token";N;s:5:"value";N;s:4:"type";i:9;s:5:"flags";i:0;s:8:"position";N;}}s:5:"count";i:8;s:3:"idx";i:0;}s:9:"delimiter";s:1:";";s:12:"delimiterLen";i:1;s:6:"errors";a:0:{}}s:6:"errors";a:0:{}}
+\ No newline at end of file
author	Dan Ungureanu <udan1107@gmail.com>	2015-07-16 16:45:04 +0300
committer	Dan Ungureanu <udan1107@gmail.com>	2015-07-16 16:45:04 +0300
commit	9e3a9ee729eaada8585fa8f33ba7d8ac2231495a (patch)
tree	430342d6b6d83dc8d89e5f55d6875451f687f0ee
parent	56ce2d7a37a1ed1d6aabf2b7908ae5da43863497 (diff)
download	sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.zip sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.tar.gz sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.tar.bz2