summaryrefslogtreecommitdiffstats
path: root/lib/SqlFormatter.php
diff options
context:
space:
mode:
Diffstat (limited to 'lib/SqlFormatter.php')
-rw-r--r--lib/SqlFormatter.php260
1 files changed, 117 insertions, 143 deletions
diff --git a/lib/SqlFormatter.php b/lib/SqlFormatter.php
index e66b23b..1d89d51 100644
--- a/lib/SqlFormatter.php
+++ b/lib/SqlFormatter.php
@@ -5,40 +5,41 @@
*
* @package SqlFormatter
* @author Jeremy Dorn <jeremy@jeremydorn.com>
+ * @author Florin Patan <florinpatan@gmail.com>
* @copyright 2012 Jeremy Dorn
* @license http://www.opensource.org/licenses/lgpl-license.php LGPL
* @link http://github.com/jdorn/sql-formatter
* @version 1.2.0
*/
class SqlFormatter
-{
+{
// Reserved words (for syntax highlighting)
protected static $reserved = array(
'ACCESSIBLE', 'ACTION', 'ADD', 'AFTER', 'AGAINST', 'AGGREGATE', 'ALGORITHM', 'ALL', 'ALTER', 'ANALYSE', 'ANALYZE', 'AND', 'AS', 'ASC',
- 'AUTOCOMMIT', 'AUTO_INCREMENT', 'AVG_ROW_LENGTH', 'BACKUP', 'BEGIN', 'BETWEEN', 'BINLOG', 'BOTH', 'BY', 'CASCADE', 'CASE', 'CHANGE', 'CHANGED',
+ 'AUTOCOMMIT', 'AUTO_INCREMENT', 'AVG_ROW_LENGTH', 'BACKUP', 'BEGIN', 'BETWEEN', 'BINLOG', 'BOTH', 'CASCADE', 'CASE', 'CHANGE', 'CHANGED',
'CHARSET', 'CHECK', 'CHECKSUM', 'COLLATE', 'COLLATION', 'COLUMN', 'COLUMNS', 'COMMENT', 'COMMIT', 'COMMITTED', 'COMPRESSED', 'CONCURRENT',
'CONSTRAINT', 'CONTAINS', 'CONVERT', 'COUNT', 'CREATE', 'CROSS', 'CURRENT_TIMESTAMP', 'DATABASE', 'DATABASES', 'DAY', 'DAY_HOUR', 'DAY_MINUTE',
'DAY_SECOND', 'DEFINER', 'DELAYED', 'DELAY_KEY_WRITE', 'DELETE', 'DESC', 'DESCRIBE', 'DETERMINISTIC', 'DISTINCT', 'DISTINCTROW', 'DIV',
'DO', 'DROP', 'DUMPFILE', 'DUPLICATE', 'DYNAMIC', 'ELSE', 'ENCLOSED', 'END', 'ENGINE', 'ENGINES', 'ESCAPE', 'ESCAPED', 'EVENTS', 'EXECUTE',
- 'EXISTS', 'EXPLAIN', 'EXTENDED', 'FAST', 'FIELDS', 'FILE', 'FIRST', 'FIXED', 'FLUSH', 'FOR', 'FORCE', 'FOREIGN', 'FROM', 'FULL', 'FULLTEXT',
- 'FUNCTION', 'GEMINI', 'GEMINI_SPIN_RETRIES', 'GLOBAL', 'GRANT', 'GRANTS', 'GROUP', 'GROUP_CONCAT', 'GROUP BY', 'HAVING', 'HEAP', 'HIGH_PRIORITY', 'HOSTS', 'HOUR', 'HOUR_MINUTE',
- 'HOUR_SECOND', 'IDENTIFIED', 'IF', 'IGNORE', 'IN', 'INDEX', 'INDEXES', 'INFILE', 'INNER', 'INNER JOIN', 'INSERT', 'INSERT_ID', 'INSERT_METHOD', 'INTERVAL',
- 'INTO', 'INVOKER', 'IS', 'ISOLATION', 'JOIN', 'KEY', 'KEYS', 'KILL', 'LAST_INSERT_ID', 'LEADING', 'LEFT', 'LEFT JOIN', 'LEVEL', 'LIKE', 'LIMIT', 'LINEAR',
+ 'EXISTS', 'EXPLAIN', 'EXTENDED', 'FAST', 'FIELDS', 'FILE', 'FIRST', 'FIXED', 'FLUSH', 'FOR', 'FORCE', 'FOREIGN', 'FULL', 'FULLTEXT',
+ 'FUNCTION', 'GEMINI', 'GEMINI_SPIN_RETRIES', 'GLOBAL', 'GRANT', 'GRANTS', 'GROUP_CONCAT', 'HEAP', 'HIGH_PRIORITY', 'HOSTS', 'HOUR', 'HOUR_MINUTE',
+ 'HOUR_SECOND', 'IDENTIFIED', 'IF', 'IGNORE', 'IN', 'INDEX', 'INDEXES', 'INFILE', 'INSERT', 'INSERT_ID', 'INSERT_METHOD', 'INTERVAL',
+ 'INTO', 'INVOKER', 'IS', 'ISOLATION', 'KEY', 'KEYS', 'KILL', 'LAST_INSERT_ID', 'LEADING', 'LEVEL', 'LIKE', 'LINEAR',
'LINES', 'LOAD', 'LOCAL', 'LOCK', 'LOCKS', 'LOGS', 'LOW_PRIORITY', 'MARIA', 'MASTER', 'MASTER_CONNECT_RETRY', 'MASTER_HOST', 'MASTER_LOG_FILE',
'MASTER_LOG_POS', 'MASTER_PASSWORD', 'MASTER_PORT', 'MASTER_USER', 'MATCH', 'MAX_CONNECTIONS_PER_HOUR', 'MAX_QUERIES_PER_HOUR',
'MAX_ROWS', 'MAX_UPDATES_PER_HOUR', 'MAX_USER_CONNECTIONS', 'MEDIUM', 'MERGE', 'MINUTE', 'MINUTE_SECOND', 'MIN_ROWS', 'MODE', 'MODIFY',
'MONTH', 'MRG_MYISAM', 'MYISAM', 'NAMES', 'NATURAL', 'NOT', 'NOW', 'NULL', 'OFFSET', 'ON', 'OPEN', 'OPTIMIZE', 'OPTION', 'OPTIONALLY', 'OR',
- 'ORDER', 'ORDER BY', 'OUTER', 'OUTER JOIN', 'OUTFILE', 'PACK_KEYS', 'PAGE', 'PARTIAL', 'PARTITION', 'PARTITIONS', 'PASSWORD', 'PRIMARY', 'PRIVILEGES', 'PROCEDURE',
+ 'OUTFILE', 'PACK_KEYS', 'PAGE', 'PARTIAL', 'PARTITION', 'PARTITIONS', 'PASSWORD', 'PRIMARY', 'PRIVILEGES', 'PROCEDURE',
'PROCESS', 'PROCESSLIST', 'PURGE', 'QUICK', 'RAID0', 'RAID_CHUNKS', 'RAID_CHUNKSIZE', 'RAID_TYPE', 'RANGE', 'READ', 'READ_ONLY',
'READ_WRITE', 'REFERENCES', 'REGEXP', 'RELOAD', 'RENAME', 'REPAIR', 'REPEATABLE', 'REPLACE', 'REPLICATION', 'RESET', 'RESTORE', 'RESTRICT',
- 'RETURN', 'RETURNS', 'REVOKE', 'RIGHT', 'RIGHT JOIN', 'RLIKE', 'ROLLBACK', 'ROW', 'ROWS', 'ROW_FORMAT', 'SECOND', 'SECURITY', 'SELECT', 'SEPARATOR',
+ 'RETURN', 'RETURNS', 'REVOKE', 'RLIKE', 'ROLLBACK', 'ROW', 'ROWS', 'ROW_FORMAT', 'SECOND', 'SECURITY', 'SEPARATOR',
'SERIALIZABLE', 'SESSION', 'SET', 'SHARE', 'SHOW', 'SHUTDOWN', 'SLAVE', 'SONAME', 'SOUNDS', 'SQL', 'SQL_AUTO_IS_NULL', 'SQL_BIG_RESULT',
'SQL_BIG_SELECTS', 'SQL_BIG_TABLES', 'SQL_BUFFER_RESULT', 'SQL_CACHE', 'SQL_CALC_FOUND_ROWS', 'SQL_LOG_BIN', 'SQL_LOG_OFF',
'SQL_LOG_UPDATE', 'SQL_LOW_PRIORITY_UPDATES', 'SQL_MAX_JOIN_SIZE', 'SQL_NO_CACHE', 'SQL_QUOTE_SHOW_CREATE', 'SQL_SAFE_UPDATES',
'SQL_SELECT_LIMIT', 'SQL_SLAVE_SKIP_COUNTER', 'SQL_SMALL_RESULT', 'SQL_WARNINGS', 'START', 'STARTING', 'STATUS', 'STOP', 'STORAGE',
'STRAIGHT_JOIN', 'STRING', 'STRIPED', 'SUPER', 'TABLE', 'TABLES', 'TEMPORARY', 'TERMINATED', 'THEN', 'TO', 'TRAILING', 'TRANSACTIONAL',
- 'TRUNCATE', 'TYPE', 'TYPES', 'UNCOMMITTED', 'UNION', 'UNIQUE', 'UNLOCK', 'UPDATE', 'USAGE', 'USE', 'USING', 'VALUES', 'VARIABLES',
- 'VIEW', 'WHEN', 'WHERE', 'WITH', 'WORK', 'WRITE', 'XOR', 'YEAR_MONTH'
+ 'TRUNCATE', 'TYPE', 'TYPES', 'UNCOMMITTED', 'UNION', 'UNIQUE', 'UNLOCK', 'USAGE', 'USE', 'USING', 'VARIABLES',
+ 'VIEW', 'WHEN', 'WITH', 'WORK', 'WRITE', 'XOR', 'YEAR_MONTH'
);
// For SQL formatting
@@ -72,9 +73,11 @@ class SqlFormatter
// This flag tells us if SqlFormatted has been initialized
protected static $init;
-
- // This is a combination of all the boundary characters and all the whitespace characters
- protected static $all_boundaries;
+
+ // Regular expressions for tokenizing
+ protected static $regex_boundaries;
+ protected static $regex_reserved;
+ protected static $regex_special_reserved;
//cache variables
//Only tokens shorter than this size will be cached. Somewhere between 10 and 20 seems to work well for most cases.
@@ -96,6 +99,20 @@ class SqlFormatter
);
}
+ protected static function init() {
+ if(self::$init) return;
+
+ //Sort reserved word list from longest word to shortest
+ usort(self::$reserved, array('SqlFormatter', 'sortLength'));
+
+ //set up regular expressions
+ self::$regex_boundaries = '('.implode('|',array_map(array('SqlFormatter', 'quote_regex'),self::$boundaries)).')';
+ self::$regex_reserved = '('.implode('|',array_map(array('SqlFormatter', 'quote_regex'),self::$reserved)).')';
+ self::$regex_special_reserved = '('.implode('|',array_map(array('SqlFormatter', 'quote_regex'),self::$special_reserved)).')';
+
+ self::$init = true;
+ }
+
/**
* Return the next token and token type in a SQL string.
* Quoted strings, comments, reserved words, whitespace, and punctuation are all their own tokens.
@@ -107,7 +124,15 @@ class SqlFormatter
*/
protected static function getNextToken($string, $previous = null)
{
- // If the next token is a comment
+ // Whitespace
+ if (preg_match('/^\s+/',$string,$matches)) {
+ return array(
+ 'token' => $matches[0],
+ 'type'=>'whitespace'
+ );
+ }
+
+ // Comment
if ($string[0] === '#' || substr($string, 0, 2) === '--' || substr($string, 0, 2) === '/*') {
// Comment until end of line
if ($string[0] === '-' || $string[0] === '#') {
@@ -128,158 +153,99 @@ class SqlFormatter
);
}
- // If the next item is a string
- if (in_array($string[0], self::$quotes)) {
- $quote = $string[0];
- for ($i = 1, $length = strlen($string); $i < $length; $i++) {
- $next_char = null;
- if (isset($string[$i + 1])) {
- $next_char = $string[$i + 1];
- }
-
- // Escaped (either backslash or backtick escaped)
- if (($quote !== '`' && $string[$i] === '\\') || ($quote === '`' && $string[$i] === '`' && $next_char === '`')) {
- $i++;
- } elseif ($string[$i] === $quote) {
- break;
- }
- }
- if ($quote === '`') {
- $type = 'backtick quote';
- } else {
- $type = 'quote';
- }
-
+ // Double Quoted String
+ if( preg_match('/^("(?:[^"\\\\]|\\\\.)*")/', $string, $matches)) {
return array(
- 'token' => substr($string, 0, $i + 1),
- 'type' => $type
+ 'token'=>$matches[1],
+ 'type'=>'quote'
);
}
-
- // Separators
- if (in_array($string[0], self::$boundaries)) {
- // If it is a simple string or empty between the parentheses, just count as a word
- // this makes it so we don't split things like NOW() or COUNT(*) into separate lines
- if ($string[0] === '(') {
- // "()"
- if (isset($string[1]) && $string[1] === ')') {
- return array(
- 'token' => '()',
- 'type' => 'word'
- );
- }
-
- // "(word/whitespace/boundary)"
- $next_token = self::getNextToken(substr($string, 1));
- $length = strlen($next_token['token']);
- if (isset($string[$length + 1]) && $string[$length + 1] === ')') {
- if ($next_token['type'] === 'word' || $next_token['type'] === 'whitespace' || $next_token['type'] === 'boundary') {
- return array(
- 'token' => '(' . $next_token['token'] . ')',
- 'type' => 'word'
- );
- }
- }
- }
-
- //return single parentheses as their own token
- if ($string[0] === '(' || $string[0] === ')') {
+
+ // Single Quoted String
+ elseif( preg_match('/^(\'(?:[^\'\\\\]|\\\\.)*\')/', $string, $matches)) {
+ return array(
+ 'token'=>$matches[1],
+ 'type'=>'quote'
+ );
+ }
+
+ // Backtick Quoted String
+ elseif( preg_match('/^(`(``|[^`])+`)/', $string, $matches)) {
+ return array(
+ 'token'=>$matches[1],
+ 'type'=>'backtick quote'
+ );
+ }
+
+ // Number followed by boundary
+ if(preg_match('/^([0-9]+(\.[0-9]+)?)($|\s|'.self::$regex_boundaries.')/',$string,$matches)) {
+ return array(
+ 'token' => $matches[1],
+ 'type'=>'number'
+ );
+ }
+
+ // Open Parentheses
+ if ($string[0] === '(') {
+ // Parentheses with something simple between them
+ // Examples: "()", "( word )", "(*)"
+ // If it is anything more complex, it will be split into multiple lines and indented
+ if(preg_match('/^(\( *([a-zA-Z\*][a-zA-Z_\-0-9\*]*)? *\))/',$string,$matches)) {
return array(
- 'token' => $string[0],
- 'type' => $string[0]
+ 'token'=>$matches[1],
+ 'type'=>'word'
);
}
-
-
- // If there are 1 or more boundary characters together, return as a single word
- $next_token = self::getNextToken(substr($string, 1));
- if ($next_token['type'] === 'boundary') {
+ //otherwise, just return the parentheses by itself
+ else {
return array(
- 'token' => $string[0].$next_token['token'],
- 'type' => 'boundary'
+ 'token'=>'(',
+ 'type'=>'('
);
}
-
- // Otherwise, just return the single boundary character
- if ($string[0] === '.' || $string[0] === ',') {
- $type = $string[0];
- } else {
- $type = 'boundary';
- }
-
+ }
+
+ // Special boundary characters (e.g. ")", ",", ".", ";")
+ if($string[0] === ')' || $string[0] === ',' || $string[0] === '.' || $string[0] === ';') {
return array(
- 'token' => $string[0],
- 'type' => $type
+ 'token'=>$string[0],
+ 'type'=>$string[0]
);
}
- // Whitespace
- if (in_array($string[0], self::$whitespace)) {
- for ($i = 1, $length = strlen($string); $i < $length; $i++) {
- if (!in_array($string[$i], self::$whitespace)) {
- break;
- }
- }
-
+ // Other Boundary Characters
+ if(preg_match('/^('.self::$regex_boundaries.'+)/',$string,$matches)) {
return array(
- 'token' => substr($string, 0, $i),
- 'type' => 'whitespace'
+ 'token' => $matches[1],
+ 'type' => 'boundary'
);
}
- if (!self::$init) {
- //Sort reserved word list from longest word to shortest
- usort(self::$reserved, array('SqlFormatter', 'sortLength'));
-
- //Combine boundary characters and whitespace
- self::$all_boundaries = array_merge(self::$boundaries, self::$whitespace);
-
- self::$init = true;
- }
-
- //a reserved word cannot be preceded by a '.'
- //this makes it so in "mytable.from", "from" is not considered a reserved word
+ // A reserved word cannot be preceded by a '.'
+ // this makes it so in "mytable.from", "from" is not considered a reserved word
if (!$previous || !isset($previous['token']) || $previous['token'] !== '.') {
- // Reserved word
- $test = strtoupper($string);
- foreach (self::$reserved as $word) {
- $length = strlen($word);
- if (substr($test, 0, $length) === $word) {
- if (isset($string[$length]) && !in_array($string[$length], self::$all_boundaries)) {
- continue;
- }
-
- if (in_array($word, self::$special_reserved)) {
- $type = 'special reserved';
- } else {
- $type = 'reserved';
- }
-
- return array(
- 'token' => substr($string, 0, $length),
- 'type' => $type
- );
- }
+ // Special Reserved Word
+ if(preg_match('/^('.self::$regex_special_reserved.')($|\s|'.self::$regex_boundaries.')/', strtoupper($string),$matches)) {
+ return array(
+ 'type'=>'special reserved',
+ 'token'=>substr($string,0,strlen($matches[1]))
+ );
}
- }
-
- // Look for first word separator
- for ($i = 1, $length = strlen($string); $i < $length; $i++) {
- if (in_array($string[$i], self::$all_boundaries)) {
- break;
+ // Other Reserved Word
+ elseif(preg_match('/^('.self::$regex_reserved.')($|\s|'.self::$regex_boundaries.')/', strtoupper($string),$matches)) {
+ return array(
+ 'type'=>'reserved',
+ 'token'=>substr($string,0,strlen($matches[1]))
+ );
}
}
- $ret = substr($string, 0, $i);
- if (is_numeric($ret)) {
- $type = 'number';
- } else {
- $type = 'word';
- }
+ // Non reserved word
+ preg_match('/^(.*?)($|\s|'.self::$regex_boundaries.')/',$string,$matches);
return array(
- 'token' => $ret,
- 'type' => $type
+ 'token' => $matches[1],
+ 'type' => 'word'
);
}
@@ -295,6 +261,8 @@ class SqlFormatter
*/
protected static function tokenize($string)
{
+ self::init();
+
$tokens = array();
//used for debugging if there is an error while tokenizing the string
@@ -332,7 +300,7 @@ class SqlFormatter
}
else {
// Get the next token and the token type
- $token = self::getNextToken($string, $token);
+ $token = self::getNextToken($string, $token);
$token_length = strlen($token['token']);
self::$cache_misses++;
@@ -592,6 +560,7 @@ class SqlFormatter
case 'boundary':
case '.':
case ',':
+ case ';':
return self::highlightBoundary($token);
case 'comment':
case 'block comment':
@@ -702,4 +671,9 @@ class SqlFormatter
{
return strlen($b) - strlen($a);
}
+
+ private static function quote_regex($a) {
+ //don't escape numbers or letters
+ return preg_replace('/([\,\;\)\(\.\=\<\>\+\-\*\/\!\^\\\%\|\&\[\]\?])/','\\\\$0',$a);
+ }
}