"""
SQL SELECT clause parser
~~~~~~~~~~~~~~~~~~~~~~~~
Creates a dictionary of keyword: SQL-text for each SELECT statement clause.
The code is mindful of quoting and respects the standard order of keywords.
Current version does not support subqueries.
"""
import re
import sys
import tokenize

# StringIO lives in a different module on Python 2 and Python 3.
if sys.version_info.major < 3:
    from StringIO import StringIO
else:
    from io import StringIO

SQL_BY_KWD = re.compile(r'^BY\s+')
SELECT_KWDS = ['SELECT', 'FROM', 'WHERE', 'GROUP', 'ORDER', 'LIMIT']
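# SELECT_KWDS is listed in standard clause order; get_sql_clauses() below
# relies on that ordering.  SQL_BY_KWD strips the leading "BY " from the text
# captured after GROUP/ORDER, e.g. SQL_BY_KWD.sub('', 'BY total') == 'total'.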


def get_sql_clauses(values):
    """Yield, for each keyword in SELECT_KWDS, the index of its token in
    *values*, or None if the keyword does not appear.  Keywords are searched
    in clause order, so each one is only looked for after the previous match."""
    start = 0
    for kwd in SELECT_KWDS:
        if kwd in values[start:]:
            start = values[start:].index(kwd) + start
            yield start
        else:
            yield None
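
# Illustrative example: for the token values
# ['SELECT', 'a', 'FROM', 't', 'ORDER', 'BY', 'b'], get_sql_clauses() yields
# 0 (SELECT), 2 (FROM), None (WHERE), None (GROUP), 4 (ORDER), None (LIMIT).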


def remove_sql_by_kwd(sql):
    """Strip the leading 'BY ' from the GROUP and ORDER clause text."""
    for kwd in ['GROUP', 'ORDER']:
        if kwd in sql:
            # Pattern.sub(repl, string): drop the leading 'BY ' from the clause.
            sql[kwd] = SQL_BY_KWD.sub('', sql[kwd])
    return sql
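
# Illustrative example: remove_sql_by_kwd({'ORDER': 'BY b', 'SELECT': 'a'})
# returns {'ORDER': 'b', 'SELECT': 'a'}.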


def get_tokens(cmd):
    """
    Split the SELECT statement *cmd* into a {keyword: clause-text} dictionary.

    >>> get_tokens('SELECT a, b, c FROM myfile1.csv, myfile2.csv ORDER BY b')
    {'SELECT': 'a, b, c', 'FROM': 'myfile1.csv, myfile2.csv', 'ORDER': 'b'}
    """
    # token = (type, value, (start_row, start_col), (end_row, end_col), line)
    toks = list(tokenize.generate_tokens(StringIO(cmd).readline))
    # Pair each keyword with the index of its token (None if the keyword is absent).
    kwds = zip(SELECT_KWDS, get_sql_clauses([i[1] for i in toks]))
    kwds = [i for i in kwds if i[1] is not None]
    if kwds:
        # Each clause's text runs from just after its keyword token to just
        # before the next keyword token.
        sql = {j[0]: cmd[toks[j[1]][3][1] + 1:toks[kwds[i + 1][1]][2][1] - 1]
               for i, j in enumerate(kwds[:-1])}
        # The last clause runs to the end of the statement.
        sql[kwds[-1][0]] = cmd[toks[kwds[-1][1]][3][1] + 1:]
        return remove_sql_by_kwd(sql)
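
# Illustrative example (not covered by the doctest above): a statement with
# WHERE and LIMIT clauses splits as
#   get_tokens('SELECT a FROM t.csv WHERE a > 1 LIMIT 10')
#   -> {'SELECT': 'a', 'FROM': 't.csv', 'WHERE': 'a > 1', 'LIMIT': '10'}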


if __name__ == '__main__':
    import doctest
    doctest.testmod()