diff options
Diffstat (limited to 'lib/SqlFormatter.php')
-rw-r--r-- | lib/SqlFormatter.php | 260 |
1 files changed, 117 insertions, 143 deletions
diff --git a/lib/SqlFormatter.php b/lib/SqlFormatter.php index e66b23b..1d89d51 100644 --- a/lib/SqlFormatter.php +++ b/lib/SqlFormatter.php @@ -5,40 +5,41 @@ * * @package SqlFormatter * @author Jeremy Dorn <jeremy@jeremydorn.com> + * @author Florin Patan <florinpatan@gmail.com> * @copyright 2012 Jeremy Dorn * @license http://www.opensource.org/licenses/lgpl-license.php LGPL * @link http://github.com/jdorn/sql-formatter * @version 1.2.0 */ class SqlFormatter -{ +{ // Reserved words (for syntax highlighting) protected static $reserved = array( 'ACCESSIBLE', 'ACTION', 'ADD', 'AFTER', 'AGAINST', 'AGGREGATE', 'ALGORITHM', 'ALL', 'ALTER', 'ANALYSE', 'ANALYZE', 'AND', 'AS', 'ASC', - 'AUTOCOMMIT', 'AUTO_INCREMENT', 'AVG_ROW_LENGTH', 'BACKUP', 'BEGIN', 'BETWEEN', 'BINLOG', 'BOTH', 'BY', 'CASCADE', 'CASE', 'CHANGE', 'CHANGED', + 'AUTOCOMMIT', 'AUTO_INCREMENT', 'AVG_ROW_LENGTH', 'BACKUP', 'BEGIN', 'BETWEEN', 'BINLOG', 'BOTH', 'CASCADE', 'CASE', 'CHANGE', 'CHANGED', 'CHARSET', 'CHECK', 'CHECKSUM', 'COLLATE', 'COLLATION', 'COLUMN', 'COLUMNS', 'COMMENT', 'COMMIT', 'COMMITTED', 'COMPRESSED', 'CONCURRENT', 'CONSTRAINT', 'CONTAINS', 'CONVERT', 'COUNT', 'CREATE', 'CROSS', 'CURRENT_TIMESTAMP', 'DATABASE', 'DATABASES', 'DAY', 'DAY_HOUR', 'DAY_MINUTE', 'DAY_SECOND', 'DEFINER', 'DELAYED', 'DELAY_KEY_WRITE', 'DELETE', 'DESC', 'DESCRIBE', 'DETERMINISTIC', 'DISTINCT', 'DISTINCTROW', 'DIV', 'DO', 'DROP', 'DUMPFILE', 'DUPLICATE', 'DYNAMIC', 'ELSE', 'ENCLOSED', 'END', 'ENGINE', 'ENGINES', 'ESCAPE', 'ESCAPED', 'EVENTS', 'EXECUTE', - 'EXISTS', 'EXPLAIN', 'EXTENDED', 'FAST', 'FIELDS', 'FILE', 'FIRST', 'FIXED', 'FLUSH', 'FOR', 'FORCE', 'FOREIGN', 'FROM', 'FULL', 'FULLTEXT', - 'FUNCTION', 'GEMINI', 'GEMINI_SPIN_RETRIES', 'GLOBAL', 'GRANT', 'GRANTS', 'GROUP', 'GROUP_CONCAT', 'GROUP BY', 'HAVING', 'HEAP', 'HIGH_PRIORITY', 'HOSTS', 'HOUR', 'HOUR_MINUTE', - 'HOUR_SECOND', 'IDENTIFIED', 'IF', 'IGNORE', 'IN', 'INDEX', 'INDEXES', 'INFILE', 'INNER', 'INNER JOIN', 'INSERT', 'INSERT_ID', 'INSERT_METHOD', 'INTERVAL', - 'INTO', 'INVOKER', 'IS', 'ISOLATION', 'JOIN', 'KEY', 'KEYS', 'KILL', 'LAST_INSERT_ID', 'LEADING', 'LEFT', 'LEFT JOIN', 'LEVEL', 'LIKE', 'LIMIT', 'LINEAR', + 'EXISTS', 'EXPLAIN', 'EXTENDED', 'FAST', 'FIELDS', 'FILE', 'FIRST', 'FIXED', 'FLUSH', 'FOR', 'FORCE', 'FOREIGN', 'FULL', 'FULLTEXT', + 'FUNCTION', 'GEMINI', 'GEMINI_SPIN_RETRIES', 'GLOBAL', 'GRANT', 'GRANTS', 'GROUP_CONCAT', 'HEAP', 'HIGH_PRIORITY', 'HOSTS', 'HOUR', 'HOUR_MINUTE', + 'HOUR_SECOND', 'IDENTIFIED', 'IF', 'IGNORE', 'IN', 'INDEX', 'INDEXES', 'INFILE', 'INSERT', 'INSERT_ID', 'INSERT_METHOD', 'INTERVAL', + 'INTO', 'INVOKER', 'IS', 'ISOLATION', 'KEY', 'KEYS', 'KILL', 'LAST_INSERT_ID', 'LEADING', 'LEVEL', 'LIKE', 'LINEAR', 'LINES', 'LOAD', 'LOCAL', 'LOCK', 'LOCKS', 'LOGS', 'LOW_PRIORITY', 'MARIA', 'MASTER', 'MASTER_CONNECT_RETRY', 'MASTER_HOST', 'MASTER_LOG_FILE', 'MASTER_LOG_POS', 'MASTER_PASSWORD', 'MASTER_PORT', 'MASTER_USER', 'MATCH', 'MAX_CONNECTIONS_PER_HOUR', 'MAX_QUERIES_PER_HOUR', 'MAX_ROWS', 'MAX_UPDATES_PER_HOUR', 'MAX_USER_CONNECTIONS', 'MEDIUM', 'MERGE', 'MINUTE', 'MINUTE_SECOND', 'MIN_ROWS', 'MODE', 'MODIFY', 'MONTH', 'MRG_MYISAM', 'MYISAM', 'NAMES', 'NATURAL', 'NOT', 'NOW', 'NULL', 'OFFSET', 'ON', 'OPEN', 'OPTIMIZE', 'OPTION', 'OPTIONALLY', 'OR', - 'ORDER', 'ORDER BY', 'OUTER', 'OUTER JOIN', 'OUTFILE', 'PACK_KEYS', 'PAGE', 'PARTIAL', 'PARTITION', 'PARTITIONS', 'PASSWORD', 'PRIMARY', 'PRIVILEGES', 'PROCEDURE', + 'OUTFILE', 'PACK_KEYS', 'PAGE', 'PARTIAL', 'PARTITION', 'PARTITIONS', 'PASSWORD', 'PRIMARY', 'PRIVILEGES', 'PROCEDURE', 'PROCESS', 'PROCESSLIST', 'PURGE', 'QUICK', 'RAID0', 'RAID_CHUNKS', 'RAID_CHUNKSIZE', 'RAID_TYPE', 'RANGE', 'READ', 'READ_ONLY', 'READ_WRITE', 'REFERENCES', 'REGEXP', 'RELOAD', 'RENAME', 'REPAIR', 'REPEATABLE', 'REPLACE', 'REPLICATION', 'RESET', 'RESTORE', 'RESTRICT', - 'RETURN', 'RETURNS', 'REVOKE', 'RIGHT', 'RIGHT JOIN', 'RLIKE', 'ROLLBACK', 'ROW', 'ROWS', 'ROW_FORMAT', 'SECOND', 'SECURITY', 'SELECT', 'SEPARATOR', + 'RETURN', 'RETURNS', 'REVOKE', 'RLIKE', 'ROLLBACK', 'ROW', 'ROWS', 'ROW_FORMAT', 'SECOND', 'SECURITY', 'SEPARATOR', 'SERIALIZABLE', 'SESSION', 'SET', 'SHARE', 'SHOW', 'SHUTDOWN', 'SLAVE', 'SONAME', 'SOUNDS', 'SQL', 'SQL_AUTO_IS_NULL', 'SQL_BIG_RESULT', 'SQL_BIG_SELECTS', 'SQL_BIG_TABLES', 'SQL_BUFFER_RESULT', 'SQL_CACHE', 'SQL_CALC_FOUND_ROWS', 'SQL_LOG_BIN', 'SQL_LOG_OFF', 'SQL_LOG_UPDATE', 'SQL_LOW_PRIORITY_UPDATES', 'SQL_MAX_JOIN_SIZE', 'SQL_NO_CACHE', 'SQL_QUOTE_SHOW_CREATE', 'SQL_SAFE_UPDATES', 'SQL_SELECT_LIMIT', 'SQL_SLAVE_SKIP_COUNTER', 'SQL_SMALL_RESULT', 'SQL_WARNINGS', 'START', 'STARTING', 'STATUS', 'STOP', 'STORAGE', 'STRAIGHT_JOIN', 'STRING', 'STRIPED', 'SUPER', 'TABLE', 'TABLES', 'TEMPORARY', 'TERMINATED', 'THEN', 'TO', 'TRAILING', 'TRANSACTIONAL', - 'TRUNCATE', 'TYPE', 'TYPES', 'UNCOMMITTED', 'UNION', 'UNIQUE', 'UNLOCK', 'UPDATE', 'USAGE', 'USE', 'USING', 'VALUES', 'VARIABLES', - 'VIEW', 'WHEN', 'WHERE', 'WITH', 'WORK', 'WRITE', 'XOR', 'YEAR_MONTH' + 'TRUNCATE', 'TYPE', 'TYPES', 'UNCOMMITTED', 'UNION', 'UNIQUE', 'UNLOCK', 'USAGE', 'USE', 'USING', 'VARIABLES', + 'VIEW', 'WHEN', 'WITH', 'WORK', 'WRITE', 'XOR', 'YEAR_MONTH' ); // For SQL formatting @@ -72,9 +73,11 @@ class SqlFormatter // This flag tells us if SqlFormatted has been initialized protected static $init; - - // This is a combination of all the boundary characters and all the whitespace characters - protected static $all_boundaries; + + // Regular expressions for tokenizing + protected static $regex_boundaries; + protected static $regex_reserved; + protected static $regex_special_reserved; //cache variables //Only tokens shorter than this size will be cached. Somewhere between 10 and 20 seems to work well for most cases. @@ -96,6 +99,20 @@ class SqlFormatter ); } + protected static function init() { + if(self::$init) return; + + //Sort reserved word list from longest word to shortest + usort(self::$reserved, array('SqlFormatter', 'sortLength')); + + //set up regular expressions + self::$regex_boundaries = '('.implode('|',array_map(array('SqlFormatter', 'quote_regex'),self::$boundaries)).')'; + self::$regex_reserved = '('.implode('|',array_map(array('SqlFormatter', 'quote_regex'),self::$reserved)).')'; + self::$regex_special_reserved = '('.implode('|',array_map(array('SqlFormatter', 'quote_regex'),self::$special_reserved)).')'; + + self::$init = true; + } + /** * Return the next token and token type in a SQL string. * Quoted strings, comments, reserved words, whitespace, and punctuation are all their own tokens. @@ -107,7 +124,15 @@ class SqlFormatter */ protected static function getNextToken($string, $previous = null) { - // If the next token is a comment + // Whitespace + if (preg_match('/^\s+/',$string,$matches)) { + return array( + 'token' => $matches[0], + 'type'=>'whitespace' + ); + } + + // Comment if ($string[0] === '#' || substr($string, 0, 2) === '--' || substr($string, 0, 2) === '/*') { // Comment until end of line if ($string[0] === '-' || $string[0] === '#') { @@ -128,158 +153,99 @@ class SqlFormatter ); } - // If the next item is a string - if (in_array($string[0], self::$quotes)) { - $quote = $string[0]; - for ($i = 1, $length = strlen($string); $i < $length; $i++) { - $next_char = null; - if (isset($string[$i + 1])) { - $next_char = $string[$i + 1]; - } - - // Escaped (either backslash or backtick escaped) - if (($quote !== '`' && $string[$i] === '\\') || ($quote === '`' && $string[$i] === '`' && $next_char === '`')) { - $i++; - } elseif ($string[$i] === $quote) { - break; - } - } - if ($quote === '`') { - $type = 'backtick quote'; - } else { - $type = 'quote'; - } - + // Double Quoted String + if( preg_match('/^("(?:[^"\\\\]|\\\\.)*")/', $string, $matches)) { return array( - 'token' => substr($string, 0, $i + 1), - 'type' => $type + 'token'=>$matches[1], + 'type'=>'quote' ); } - - // Separators - if (in_array($string[0], self::$boundaries)) { - // If it is a simple string or empty between the parentheses, just count as a word - // this makes it so we don't split things like NOW() or COUNT(*) into separate lines - if ($string[0] === '(') { - // "()" - if (isset($string[1]) && $string[1] === ')') { - return array( - 'token' => '()', - 'type' => 'word' - ); - } - - // "(word/whitespace/boundary)" - $next_token = self::getNextToken(substr($string, 1)); - $length = strlen($next_token['token']); - if (isset($string[$length + 1]) && $string[$length + 1] === ')') { - if ($next_token['type'] === 'word' || $next_token['type'] === 'whitespace' || $next_token['type'] === 'boundary') { - return array( - 'token' => '(' . $next_token['token'] . ')', - 'type' => 'word' - ); - } - } - } - - //return single parentheses as their own token - if ($string[0] === '(' || $string[0] === ')') { + + // Single Quoted String + elseif( preg_match('/^(\'(?:[^\'\\\\]|\\\\.)*\')/', $string, $matches)) { + return array( + 'token'=>$matches[1], + 'type'=>'quote' + ); + } + + // Backtick Quoted String + elseif( preg_match('/^(`(``|[^`])+`)/', $string, $matches)) { + return array( + 'token'=>$matches[1], + 'type'=>'backtick quote' + ); + } + + // Number followed by boundary + if(preg_match('/^([0-9]+(\.[0-9]+)?)($|\s|'.self::$regex_boundaries.')/',$string,$matches)) { + return array( + 'token' => $matches[1], + 'type'=>'number' + ); + } + + // Open Parentheses + if ($string[0] === '(') { + // Parentheses with something simple between them + // Examples: "()", "( word )", "(*)" + // If it is anything more complex, it will be split into multiple lines and indented + if(preg_match('/^(\( *([a-zA-Z\*][a-zA-Z_\-0-9\*]*)? *\))/',$string,$matches)) { return array( - 'token' => $string[0], - 'type' => $string[0] + 'token'=>$matches[1], + 'type'=>'word' ); } - - - // If there are 1 or more boundary characters together, return as a single word - $next_token = self::getNextToken(substr($string, 1)); - if ($next_token['type'] === 'boundary') { + //otherwise, just return the parentheses by itself + else { return array( - 'token' => $string[0].$next_token['token'], - 'type' => 'boundary' + 'token'=>'(', + 'type'=>'(' ); } - - // Otherwise, just return the single boundary character - if ($string[0] === '.' || $string[0] === ',') { - $type = $string[0]; - } else { - $type = 'boundary'; - } - + } + + // Special boundary characters (e.g. ")", ",", ".", ";") + if($string[0] === ')' || $string[0] === ',' || $string[0] === '.' || $string[0] === ';') { return array( - 'token' => $string[0], - 'type' => $type + 'token'=>$string[0], + 'type'=>$string[0] ); } - // Whitespace - if (in_array($string[0], self::$whitespace)) { - for ($i = 1, $length = strlen($string); $i < $length; $i++) { - if (!in_array($string[$i], self::$whitespace)) { - break; - } - } - + // Other Boundary Characters + if(preg_match('/^('.self::$regex_boundaries.'+)/',$string,$matches)) { return array( - 'token' => substr($string, 0, $i), - 'type' => 'whitespace' + 'token' => $matches[1], + 'type' => 'boundary' ); } - if (!self::$init) { - //Sort reserved word list from longest word to shortest - usort(self::$reserved, array('SqlFormatter', 'sortLength')); - - //Combine boundary characters and whitespace - self::$all_boundaries = array_merge(self::$boundaries, self::$whitespace); - - self::$init = true; - } - - //a reserved word cannot be preceded by a '.' - //this makes it so in "mytable.from", "from" is not considered a reserved word + // A reserved word cannot be preceded by a '.' + // this makes it so in "mytable.from", "from" is not considered a reserved word if (!$previous || !isset($previous['token']) || $previous['token'] !== '.') { - // Reserved word - $test = strtoupper($string); - foreach (self::$reserved as $word) { - $length = strlen($word); - if (substr($test, 0, $length) === $word) { - if (isset($string[$length]) && !in_array($string[$length], self::$all_boundaries)) { - continue; - } - - if (in_array($word, self::$special_reserved)) { - $type = 'special reserved'; - } else { - $type = 'reserved'; - } - - return array( - 'token' => substr($string, 0, $length), - 'type' => $type - ); - } + // Special Reserved Word + if(preg_match('/^('.self::$regex_special_reserved.')($|\s|'.self::$regex_boundaries.')/', strtoupper($string),$matches)) { + return array( + 'type'=>'special reserved', + 'token'=>substr($string,0,strlen($matches[1])) + ); } - } - - // Look for first word separator - for ($i = 1, $length = strlen($string); $i < $length; $i++) { - if (in_array($string[$i], self::$all_boundaries)) { - break; + // Other Reserved Word + elseif(preg_match('/^('.self::$regex_reserved.')($|\s|'.self::$regex_boundaries.')/', strtoupper($string),$matches)) { + return array( + 'type'=>'reserved', + 'token'=>substr($string,0,strlen($matches[1])) + ); } } - $ret = substr($string, 0, $i); - if (is_numeric($ret)) { - $type = 'number'; - } else { - $type = 'word'; - } + // Non reserved word + preg_match('/^(.*?)($|\s|'.self::$regex_boundaries.')/',$string,$matches); return array( - 'token' => $ret, - 'type' => $type + 'token' => $matches[1], + 'type' => 'word' ); } @@ -295,6 +261,8 @@ class SqlFormatter */ protected static function tokenize($string) { + self::init(); + $tokens = array(); //used for debugging if there is an error while tokenizing the string @@ -332,7 +300,7 @@ class SqlFormatter } else { // Get the next token and the token type - $token = self::getNextToken($string, $token); + $token = self::getNextToken($string, $token); $token_length = strlen($token['token']); self::$cache_misses++; @@ -592,6 +560,7 @@ class SqlFormatter case 'boundary': case '.': case ',': + case ';': return self::highlightBoundary($token); case 'comment': case 'block comment': @@ -702,4 +671,9 @@ class SqlFormatter { return strlen($b) - strlen($a); } + + private static function quote_regex($a) { + //don't escape numbers or letters + return preg_replace('/([\,\;\)\(\.\=\<\>\+\-\*\/\!\^\\\%\|\&\[\]\?])/','\\\\$0',$a); + } } |