diff options
author | Dan Ungureanu <udan1107@gmail.com> | 2015-07-16 16:45:04 +0300 |
---|---|---|
committer | Dan Ungureanu <udan1107@gmail.com> | 2015-07-16 16:45:04 +0300 |
commit | 9e3a9ee729eaada8585fa8f33ba7d8ac2231495a (patch) | |
tree | 430342d6b6d83dc8d89e5f55d6875451f687f0ee | |
parent | 56ce2d7a37a1ed1d6aabf2b7908ae5da43863497 (diff) | |
download | sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.zip sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.tar.gz sql-parser-9e3a9ee729eaada8585fa8f33ba7d8ac2231495a.tar.bz2 |
Fixed support for multi-byte strings.
-rw-r--r-- | phpunit.xml | 3 | ||||
-rw-r--r-- | src/Components/OptionsArray.php | 4 | ||||
-rw-r--r-- | src/Context.php | 4 | ||||
-rw-r--r-- | src/Lexer.php | 32 | ||||
-rw-r--r-- | src/UtfString.php | 43 | ||||
-rw-r--r-- | tests/Lexer/LexerTest.php | 1 | ||||
-rw-r--r-- | tests/Misc/UtfStringTest.php | 88 | ||||
-rw-r--r-- | tests/data/lexUtf8.in | 1 | ||||
-rw-r--r-- | tests/data/lexUtf8.out | 1 |
9 files changed, 134 insertions, 43 deletions
diff --git a/phpunit.xml b/phpunit.xml index ea82742..5d34349 100644 --- a/phpunit.xml +++ b/phpunit.xml @@ -22,6 +22,9 @@ <testsuite name="Lexer"> <directory suffix=".php">./tests/Lexer</directory> </testsuite> + <testsuite name="Misc"> + <directory suffix=".php">./tests/Misc</directory> + </testsuite> <testsuite name="Parser"> <directory suffix=".php">./tests/Parser</directory> </testsuite> diff --git a/src/Components/OptionsArray.php b/src/Components/OptionsArray.php index a6b612d..487eefd 100644 --- a/src/Components/OptionsArray.php +++ b/src/Components/OptionsArray.php @@ -201,8 +201,8 @@ class OptionsArray extends Component $list, empty($lastOption[2]) ? array() : $lastOption[2] ); - $ret->options[$lastOptionId]['value'] = - $ret->options[$lastOptionId]['expr']->expr; + $ret->options[$lastOptionId]['value'] + = $ret->options[$lastOptionId]['expr']->expr; $lastOption = null; $state = 0; } else { diff --git a/src/Context.php b/src/Context.php index efcb8df..dd912fb 100644 --- a/src/Context.php +++ b/src/Context.php @@ -397,7 +397,9 @@ abstract class Context */ public static function isSeparator($str) { - return !ctype_alnum($str) && $str !== '_'; + // NOTES: Only ASCII characters may be separators. + // `~` is the last printable ASCII character. + return ($str <= '~') && (!ctype_alnum($str)) && ($str !== '_'); } /** diff --git a/src/Lexer.php b/src/Lexer.php index d70641f..6e8c0fd 100644 --- a/src/Lexer.php +++ b/src/Lexer.php @@ -13,6 +13,16 @@ namespace SqlParser; use SqlParser\Exceptions\LexerException; +if (!defined('USE_UTF_STRINGS')) { + + /** + * Forces usage of `UtfString` if the string is multibyte. + * `UtfString` may be slower, but it gives better results. + * @var bool + */ + define('USE_UTF_STRINGS', true); +} + /** * Performs lexical analysis over a SQL statement and splits it in multiple * tokens. 
@@ -149,11 +159,27 @@ class Lexer */ public function __construct($str, $strict = false) { - $this->str = $str; - $this->len = ($str instanceof UtfString) ? - $str->length() : strlen($str); + // For multi-byte strings, a new instance of `UtfString` is + // initialized (only if `UtfString` usage is forced). + if (!($str instanceof UtfString)) { + $len = strlen($str); + if ((USE_UTF_STRINGS) && ($len != mb_strlen($str))) { + $str = new UtfString($str); + } + } + + if ($str instanceof UtfString) { + $this->str = $str; + $this->len = $str->length(); + } else { + $this->str = $str; + // `strlen` is used instead of `mb_strlen` because the lexer + // needs to parse each byte of the input. + $this->len = $len; + } $this->strict = $strict; + // Setting the delimiter. $this->delimiter = static::$DEFAULT_DELIMITER; $this->lex(); diff --git a/src/UtfString.php b/src/UtfString.php index 27e3f93..1e863a4 100644 --- a/src/UtfString.php +++ b/src/UtfString.php @@ -93,7 +93,7 @@ class UtfString implements \ArrayAccess */ public function offsetExists($offset) { - return $offset < $this->charLen; + return ($offset >= 0) && ($offset < $this->charLen); } /** @@ -190,26 +190,13 @@ class UtfString implements \ArrayAccess return 3; } elseif ($byte < 248) { return 4; - } elseif ($byte === 252) { + } elseif ($byte < 252) { return 5; // unofficial } return 6; // unofficial } /** - * Returns the number of remaining characters. - * - * @return int - */ - public function remaining() - { - if ($this->charIdx < $this->charLen) { - return $this->charLen - $this->charIdx; - } - return 0; - } - - /** * Returns the length in characters of the string. * * @return int @@ -220,30 +207,12 @@ class UtfString implements \ArrayAccess } /** - * Gets the values of the indexes. - * - * @param int &$byte Reference to the byte index. - * @param int &$char Reference to the character index. 
- * - * @return void - */ - public function getIndexes(&$byte, &$char) - { - $byte = $this->byteIdx; - $char = $this->charIdx; - } - - /** - * Sets the values of the indexes. + * Returns the contained string. * - * @param int $byte The byte index. - * @param int $char The character index. - * - * @return void + * @return string */ - public function setIndexes($byte = 0, $char = 0) + public function __toString() { - $this->byteIdx = $byte; - $this->charIdx = $char; + return $this->str; } } diff --git a/tests/Lexer/LexerTest.php b/tests/Lexer/LexerTest.php index dabe64d..cd8718c 100644 --- a/tests/Lexer/LexerTest.php +++ b/tests/Lexer/LexerTest.php @@ -51,6 +51,7 @@ class LexerTest extends TestCase { return array( array('lex'), + array('lexUtf8'), array('lexBool'), array('lexComment'), array('lexDelimiter'), diff --git a/tests/Misc/UtfStringTest.php b/tests/Misc/UtfStringTest.php new file mode 100644 index 0000000..0d1ff78 --- /dev/null +++ b/tests/Misc/UtfStringTest.php @@ -0,0 +1,88 @@ +<?php + +namespace SqlParser\Tests\Misc; + +use SqlParser\UtfString; + +use SqlParser\Tests\TestCase; + +class UtfStringTest extends TestCase +{ + + /** + * Sample phrase in French. + * + * @var string + */ + const TEST_PHRASE = 'Les naïfs ægithales hâtifs pondant à Noël où il ' . + 'gèle sont sûrs d\'être déçus en voyant leurs drôles d\'œufs abîmés.'; + + /** + * The length of the sample phrase. + * + * @var int + */ + const TEST_PHRASE_LEN = 113; + + public function testArrayAccess() + { + $str = new UtfString(static::TEST_PHRASE); + + // offsetExists + $this->assertTrue(isset($str[static::TEST_PHRASE_LEN - 1])); + $this->assertFalse(isset($str[-1])); + $this->assertFalse(isset($str[static::TEST_PHRASE_LEN])); + + // offsetGet + $this->assertEquals('.', $str[static::TEST_PHRASE_LEN - 1]); + $this->assertEquals(null, $str[-1]); + $this->assertEquals(null, $str[static::TEST_PHRASE_LEN]); + } + + /** + * @expectedException \Exception + * @expectedExceptionMessage Not implemented. 
+ */ + public function testSet() + { + $str = new UtfString(''); + $str[0] = 'a'; + } + + /** + * @expectedException \Exception + * @expectedExceptionMessage Not implemented. + */ + public function testUnset() + { + $str = new UtfString(''); + unset($str[0]); + } + + public function testGetCharLength() + { + $this->assertEquals(1, UtfString::getCharLength(chr(0x00))); // 00000000 + $this->assertEquals(1, UtfString::getCharLength(chr(0x7F))); // 01111111 + + $this->assertEquals(2, UtfString::getCharLength(chr(0xC0))); // 11000000 + $this->assertEquals(2, UtfString::getCharLength(chr(0xDF))); // 11011111 + + $this->assertEquals(3, UtfString::getCharLength(chr(0xE0))); // 11100000 + $this->assertEquals(3, UtfString::getCharLength(chr(0xEF))); // 11101111 + + $this->assertEquals(4, UtfString::getCharLength(chr(0xF0))); // 11110000 + $this->assertEquals(4, UtfString::getCharLength(chr(0xF7))); // 11110111 + + $this->assertEquals(5, UtfString::getCharLength(chr(0xF8))); // 11111000 + $this->assertEquals(5, UtfString::getCharLength(chr(0xFB))); // 11111011 + + $this->assertEquals(6, UtfString::getCharLength(chr(0xFC))); // 11111100 + $this->assertEquals(6, UtfString::getCharLength(chr(0xFD))); // 11111101 + } + + public function testToString() + { + $str = new UtfString(static::TEST_PHRASE); + $this->assertEquals(static::TEST_PHRASE, (string) $str); + } +} diff --git a/tests/data/lexUtf8.in b/tests/data/lexUtf8.in new file mode 100644 index 0000000..9cf478e --- /dev/null +++ b/tests/data/lexUtf8.in @@ -0,0 +1 @@ +select * from école
\ No newline at end of file diff --git a/tests/data/lexUtf8.out b/tests/data/lexUtf8.out new file mode 100644 index 0000000..a930264 --- /dev/null +++ b/tests/data/lexUtf8.out @@ -0,0 +1 @@ +a:2:{s:5:"lexer";O:15:"SqlParser\Lexer":8:{s:6:"strict";b:0;s:3:"str";O:19:"SqlParser\UtfString":5:{s:3:"str";s:20:"select * from école";s:7:"byteIdx";i:19;s:7:"charIdx";i:18;s:7:"byteLen";i:20;s:7:"charLen";i:19;}s:3:"len";i:19;s:4:"last";i:19;s:4:"list";O:20:"SqlParser\TokensList":3:{s:6:"tokens";a:8:{i:0;O:15:"SqlParser\Token":5:{s:5:"token";s:6:"select";s:5:"value";s:6:"SELECT";s:4:"type";i:1;s:5:"flags";i:3;s:8:"position";i:0;}i:1;O:15:"SqlParser\Token":5:{s:5:"token";s:1:" ";s:5:"value";s:1:" ";s:4:"type";i:3;s:5:"flags";i:0;s:8:"position";i:6;}i:2;O:15:"SqlParser\Token":5:{s:5:"token";s:1:"*";s:5:"value";s:1:"*";s:4:"type";i:2;s:5:"flags";i:1;s:8:"position";i:7;}i:3;O:15:"SqlParser\Token":5:{s:5:"token";s:1:" ";s:5:"value";s:1:" ";s:4:"type";i:3;s:5:"flags";i:0;s:8:"position";i:8;}i:4;O:15:"SqlParser\Token":5:{s:5:"token";s:4:"from";s:5:"value";s:4:"FROM";s:4:"type";i:1;s:5:"flags";i:3;s:8:"position";i:9;}i:5;O:15:"SqlParser\Token":5:{s:5:"token";s:1:" ";s:5:"value";s:1:" ";s:4:"type";i:3;s:5:"flags";i:0;s:8:"position";i:13;}i:6;O:15:"SqlParser\Token":5:{s:5:"token";s:6:"école";s:5:"value";s:6:"école";s:4:"type";i:0;s:5:"flags";i:0;s:8:"position";i:14;}i:7;O:15:"SqlParser\Token":5:{s:5:"token";N;s:5:"value";N;s:4:"type";i:9;s:5:"flags";i:0;s:8:"position";N;}}s:5:"count";i:8;s:3:"idx";i:0;}s:9:"delimiter";s:1:";";s:12:"delimiterLen";i:1;s:6:"errors";a:0:{}}s:6:"errors";a:0:{}}
\ No newline at end of file |