summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Dan Ungureanu <udan1107@gmail.com> 2015-07-16 16:45:04 +0300
committer: Dan Ungureanu <udan1107@gmail.com> 2015-07-16 16:45:04 +0300
commit: 9e3a9ee729eaada8585fa8f33ba7d8ac2231495a (patch)
tree: 430342d6b6d83dc8d89e5f55d6875451f687f0ee
parent: 56ce2d7a37a1ed1d6aabf2b7908ae5da43863497 (diff)
download: sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.zip
sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.tar.gz
sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.tar.bz2
Fixed support for multi-byte strings.
-rw-r--r-- phpunit.xml 3
-rw-r--r-- src/Components/OptionsArray.php 4
-rw-r--r-- src/Context.php 4
-rw-r--r-- src/Lexer.php 32
-rw-r--r-- src/UtfString.php 43
-rw-r--r-- tests/Lexer/LexerTest.php 1
-rw-r--r-- tests/Misc/UtfStringTest.php 88
-rw-r--r-- tests/data/lexUtf8.in 1
-rw-r--r-- tests/data/lexUtf8.out 1
9 files changed, 134 insertions, 43 deletions
diff --git a/phpunit.xml b/phpunit.xml
index ea82742..5d34349 100644
--- a/phpunit.xml
+++ b/phpunit.xml
@@ -22,6 +22,9 @@
<testsuite name="Lexer">
<directory suffix=".php">./tests/Lexer</directory>
</testsuite>
+ <testsuite name="Misc">
+ <directory suffix=".php">./tests/Misc</directory>
+ </testsuite>
<testsuite name="Parser">
<directory suffix=".php">./tests/Parser</directory>
</testsuite>
diff --git a/src/Components/OptionsArray.php b/src/Components/OptionsArray.php
index a6b612d..487eefd 100644
--- a/src/Components/OptionsArray.php
+++ b/src/Components/OptionsArray.php
@@ -201,8 +201,8 @@ class OptionsArray extends Component
$list,
empty($lastOption[2]) ? array() : $lastOption[2]
);
- $ret->options[$lastOptionId]['value'] =
- $ret->options[$lastOptionId]['expr']->expr;
+ $ret->options[$lastOptionId]['value']
+ = $ret->options[$lastOptionId]['expr']->expr;
$lastOption = null;
$state = 0;
} else {
diff --git a/src/Context.php b/src/Context.php
index efcb8df..dd912fb 100644
--- a/src/Context.php
+++ b/src/Context.php
@@ -397,7 +397,9 @@ abstract class Context
*/
public static function isSeparator($str)
{
- return !ctype_alnum($str) && $str !== '_';
+ // NOTES: Only ASCII characters may be separators.
+ // `~` is the last printable ASCII character.
+ return ($str <= '~') && (!ctype_alnum($str)) && ($str !== '_');
}
/**
diff --git a/src/Lexer.php b/src/Lexer.php
index d70641f..6e8c0fd 100644
--- a/src/Lexer.php
+++ b/src/Lexer.php
@@ -13,6 +13,16 @@ namespace SqlParser;
use SqlParser\Exceptions\LexerException;
+if (!defined('USE_UTF_STRINGS')) {
+
+ /**
+ * Forces usage of `UtfString` if the string is multibyte.
+ * `UtfString` may be slower, but it gives better results.
+ * @var bool
+ */
+ define('USE_UTF_STRINGS', true);
+}
+
/**
* Performs lexical analysis over a SQL statement and splits it in multiple
* tokens.
@@ -149,11 +159,27 @@ class Lexer
*/
public function __construct($str, $strict = false)
{
- $this->str = $str;
- $this->len = ($str instanceof UtfString) ?
- $str->length() : strlen($str);
+ // For multi-byte strings, a new instance of `UtfString` is
+ // initialized (only if `UtfString` usage is forced).
+ if (!($str instanceof UtfString)) {
+ $len = strlen($str);
+ if ((USE_UTF_STRINGS) && ($len != mb_strlen($str))) {
+ $str = new UtfString($str);
+ }
+ }
+
+ if ($str instanceof UtfString) {
+ $this->str = $str;
+ $this->len = $str->length();
+ } else {
+ $this->str = $str;
+ // `strlen` is used instead of `mb_strlen` because the lexer
+ // needs to parse each byte of the input.
+ $this->len = $len;
+ }
$this->strict = $strict;
+ // Setting the delimiter.
$this->delimiter = static::$DEFAULT_DELIMITER;
$this->lex();
diff --git a/src/UtfString.php b/src/UtfString.php
index 27e3f93..1e863a4 100644
--- a/src/UtfString.php
+++ b/src/UtfString.php
@@ -93,7 +93,7 @@ class UtfString implements \ArrayAccess
*/
public function offsetExists($offset)
{
- return $offset < $this->charLen;
+ return ($offset >= 0) && ($offset < $this->charLen);
}
/**
@@ -190,26 +190,13 @@ class UtfString implements \ArrayAccess
return 3;
} elseif ($byte < 248) {
return 4;
- } elseif ($byte === 252) {
+ } elseif ($byte < 252) {
return 5; // unofficial
}
return 6; // unofficial
}
/**
- * Returns the number of remaining characters.
- *
- * @return int
- */
- public function remaining()
- {
- if ($this->charIdx < $this->charLen) {
- return $this->charLen - $this->charIdx;
- }
- return 0;
- }
-
- /**
* Returns the length in characters of the string.
*
* @return int
@@ -220,30 +207,12 @@ class UtfString implements \ArrayAccess
}
/**
- * Gets the values of the indexes.
- *
- * @param int &$byte Reference to the byte index.
- * @param int &$char Reference to the character index.
- *
- * @return void
- */
- public function getIndexes(&$byte, &$char)
- {
- $byte = $this->byteIdx;
- $char = $this->charIdx;
- }
-
- /**
- * Sets the values of the indexes.
+ * Returns the contained string.
*
- * @param int $byte The byte index.
- * @param int $char The character index.
- *
- * @return void
+ * @return string
*/
- public function setIndexes($byte = 0, $char = 0)
+ public function __toString()
{
- $this->byteIdx = $byte;
- $this->charIdx = $char;
+ return $this->str;
}
}
diff --git a/tests/Lexer/LexerTest.php b/tests/Lexer/LexerTest.php
index dabe64d..cd8718c 100644
--- a/tests/Lexer/LexerTest.php
+++ b/tests/Lexer/LexerTest.php
@@ -51,6 +51,7 @@ class LexerTest extends TestCase
{
return array(
array('lex'),
+ array('lexUtf8'),
array('lexBool'),
array('lexComment'),
array('lexDelimiter'),
diff --git a/tests/Misc/UtfStringTest.php b/tests/Misc/UtfStringTest.php
new file mode 100644
index 0000000..0d1ff78
--- /dev/null
+++ b/tests/Misc/UtfStringTest.php
@@ -0,0 +1,88 @@
+<?php
+
+namespace SqlParser\Tests\Misc;
+
+use SqlParser\UtfString;
+
+use SqlParser\Tests\TestCase;
+
+class UtfStringTest extends TestCase
+{
+
+ /**
+ * Sample phrase in French.
+ *
+ * @var string
+ */
+ const TEST_PHRASE = 'Les naïfs ægithales hâtifs pondant à Noël où il ' .
+ 'gèle sont sûrs d\'être déçus en voyant leurs drôles d\'œufs abîmés.';
+
+ /**
+ * The length of the sample phrase.
+ *
+ * @var int
+ */
+ const TEST_PHRASE_LEN = 113;
+
+ public function testArrayAccess()
+ {
+ $str = new UtfString(static::TEST_PHRASE);
+
+ // offsetExists
+ $this->assertTrue(isset($str[static::TEST_PHRASE_LEN - 1]));
+ $this->assertFalse(isset($str[-1]));
+ $this->assertFalse(isset($str[static::TEST_PHRASE_LEN]));
+
+ // offsetGet
+ $this->assertEquals('.', $str[static::TEST_PHRASE_LEN - 1]);
+ $this->assertEquals(null, $str[-1]);
+ $this->assertEquals(null, $str[static::TEST_PHRASE_LEN]);
+ }
+
+ /**
+ * @expectedException \Exception
+ * @expectedExceptionMessage Not implemented.
+ */
+ public function testSet()
+ {
+ $str = new UtfString('');
+ $str[0] = 'a';
+ }
+
+ /**
+ * @expectedException \Exception
+ * @expectedExceptionMessage Not implemented.
+ */
+ public function testUnset()
+ {
+ $str = new UtfString('');
+ unset($str[0]);
+ }
+
+ public function testGetCharLength()
+ {
+ $this->assertEquals(1, UtfString::getCharLength(chr(0x00))); // 00000000
+ $this->assertEquals(1, UtfString::getCharLength(chr(0x7F))); // 01111111
+
+ $this->assertEquals(2, UtfString::getCharLength(chr(0xC0))); // 11000000
+ $this->assertEquals(2, UtfString::getCharLength(chr(0xDF))); // 11011111
+
+ $this->assertEquals(3, UtfString::getCharLength(chr(0xE0))); // 11100000
+ $this->assertEquals(3, UtfString::getCharLength(chr(0xEF))); // 11101111
+
+ $this->assertEquals(4, UtfString::getCharLength(chr(0xF0))); // 11110000
+ $this->assertEquals(4, UtfString::getCharLength(chr(0xF7))); // 11110111
+
+ $this->assertEquals(5, UtfString::getCharLength(chr(0xF8))); // 11111000
+ $this->assertEquals(5, UtfString::getCharLength(chr(0xFB))); // 11111011
+
+ $this->assertEquals(6, UtfString::getCharLength(chr(0xFC))); // 11111100
+ $this->assertEquals(6, UtfString::getCharLength(chr(0xFD))); // 11111101
+ }
+
+ public function testToString()
+ {
+ $str = new UtfString(static::TEST_PHRASE);
+ $this->assertEquals(static::TEST_PHRASE, (string) $str);
+ }
+}
diff --git a/tests/data/lexUtf8.in b/tests/data/lexUtf8.in
new file mode 100644
index 0000000..9cf478e
--- /dev/null
+++ b/tests/data/lexUtf8.in
@@ -0,0 +1 @@
+select * from école
\ No newline at end of file
diff --git a/tests/data/lexUtf8.out b/tests/data/lexUtf8.out
new file mode 100644
index 0000000..a930264
--- /dev/null
+++ b/tests/data/lexUtf8.out
@@ -0,0 +1 @@
+a:2:{s:5:"lexer";O:15:"SqlParser\Lexer":8:{s:6:"strict";b:0;s:3:"str";O:19:"SqlParser\UtfString":5:{s:3:"str";s:20:"select * from école";s:7:"byteIdx";i:19;s:7:"charIdx";i:18;s:7:"byteLen";i:20;s:7:"charLen";i:19;}s:3:"len";i:19;s:4:"last";i:19;s:4:"list";O:20:"SqlParser\TokensList":3:{s:6:"tokens";a:8:{i:0;O:15:"SqlParser\Token":5:{s:5:"token";s:6:"select";s:5:"value";s:6:"SELECT";s:4:"type";i:1;s:5:"flags";i:3;s:8:"position";i:0;}i:1;O:15:"SqlParser\Token":5:{s:5:"token";s:1:" ";s:5:"value";s:1:" ";s:4:"type";i:3;s:5:"flags";i:0;s:8:"position";i:6;}i:2;O:15:"SqlParser\Token":5:{s:5:"token";s:1:"*";s:5:"value";s:1:"*";s:4:"type";i:2;s:5:"flags";i:1;s:8:"position";i:7;}i:3;O:15:"SqlParser\Token":5:{s:5:"token";s:1:" ";s:5:"value";s:1:" ";s:4:"type";i:3;s:5:"flags";i:0;s:8:"position";i:8;}i:4;O:15:"SqlParser\Token":5:{s:5:"token";s:4:"from";s:5:"value";s:4:"FROM";s:4:"type";i:1;s:5:"flags";i:3;s:8:"position";i:9;}i:5;O:15:"SqlParser\Token":5:{s:5:"token";s:1:" ";s:5:"value";s:1:" ";s:4:"type";i:3;s:5:"flags";i:0;s:8:"position";i:13;}i:6;O:15:"SqlParser\Token":5:{s:5:"token";s:6:"école";s:5:"value";s:6:"école";s:4:"type";i:0;s:5:"flags";i:0;s:8:"position";i:14;}i:7;O:15:"SqlParser\Token":5:{s:5:"token";N;s:5:"value";N;s:4:"type";i:9;s:5:"flags";i:0;s:8:"position";N;}}s:5:"count";i:8;s:3:"idx";i:0;}s:9:"delimiter";s:1:";";s:12:"delimiterLen";i:1;s:6:"errors";a:0:{}}s:6:"errors";a:0:{}} \ No newline at end of file