View of /SCALE/scale/dsl.py

Revision: 2086
Sun Sep 4 07:06:35 2005 UTC by pje
File size: 6889 byte(s)
Created a new project for SCALE - the Syntax for Configuration
And Language Extensions.  So far, this just includes a nifty
generic parser for "Python-like" languages.  One of the examples
that shows the power of the parser is a 7-line Python code 
reformatter that can reindent Python code without changing its
semantics, properly handling comments, multi-line strings,
continued lines, and much much more.  The actual SCALE mini-
language will be implemented atop the generic parser, and it
should be pretty easy to create other domain-specific languages
(like parser generators and other code generators) atop it as
well.
"""Parsing tools for SCALE and other Python-like Domain-Specific Languages"""

__all__ = [
    "tokenize_string", "tokenize_stream", "tokenize_file", "detokenize",
    "parse_block", "flatten_block", "strip_ws", "TokenError",
]

from StringIO import StringIO
from tokenize import generate_tokens, NL, NEWLINE, ENDMARKER, INDENT, DEDENT
from tokenize import COMMENT, NAME, NUMBER, STRING, ERRORTOKEN, OP, TokenError
import re

WHITESPACE = dict.fromkeys([NL,NEWLINE,ENDMARKER,INDENT,DEDENT,COMMENT])
OPEN_PARENS = dict.fromkeys('{[(')
CLOSE_PARENS = {'}':'{', ')':'(', ']':'['}

BOM = '\xef\xbb\xbf'
ENCODING_LINE = re.compile('('+BOM+')|\s*(#.*)?$').match
FIND_ENCODING = re.compile('coding[:=]\s*([-\w.]+)').search

def tokenize_string(text):
    """Yield the tokens of `text`

    `text` may be a string or unicode object.  If it's a string, PEP 263
    source encoding comments/BOM markers will be recognized.
    """
    return tokenize_stream(StringIO(text))
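
# Illustrative usage sketch (not part of the original module): each yielded
# item is a 5-tuple in the shape produced by the stdlib `tokenize` module,
# i.e. (token_type, value, (srow, scol), (erow, ecol), line):
#
#     for tok, val, start, end, line in tokenize_string("x = 1\n"):
#         print tok, repr(val)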

def tokenize_file(filename):
    """Yield the tokens contained in the file named by `filename`

    PEP 263 source encoding comments and BOM markers will be recognized.
    """
    return tokenize_stream(open(filename,'rU'))
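
# Illustrative sketch, assuming `example.py` stands for any Python-like source
# file on disk; the file is read in universal-newline mode:
#
#     for tok, val, start, end, line in tokenize_file("example.py"):
#         print start, repr(val)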







def tokenize_stream(file):
    """Yield the tokens contained in an iterable of lines

    This function assumes only that `file` is an iterable yielding string or
    Unicode objects.  If the first two lines are strings, they're checked for
    PEP 263 source encoding comments, and the first is checked for a UTF-8 BOM
    marker.  If an encoding or BOM is found, the remainder of the stream is
    decoded as unicode.  (Lines prior to the declaration line are not decoded.)
    """
    def reader(f):
        lno, encoding, bom = 1, None, False
        for line in f:
            if isinstance(line, unicode) or not ENCODING_LINE(line):
                yield line  
                break
            if lno==1 and line.startswith(BOM):
                bom = True
                encoding = "utf8"
                line = line[len(BOM):].decode(encoding)
            if line.find('coding')>=0:
                match = FIND_ENCODING(line)
                if match:
                    encoding = match.group(1)
                    if bom and encoding.lower() not in ('utf8', 'utf-8'):
                        raise TokenError(
                            ("UTF-8 BOM, but %r encoding requested" %encoding),
                            (lno,match.start(1))
                        )
            yield line
            lno += 1
            if encoding or lno>2:
                break
        if encoding:
            from codecs import getreader
            f = getreader(encoding)(f)

        for line in f:
            yield line
        yield ''
    return generate_tokens(reader(file).next)
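
# Illustrative sketch (assumptions noted): any iterator of lines works when no
# coding declaration is present; when one *is* found, the remaining stream is
# wrapped with codecs.getreader, so a file object or StringIO (anything with a
# .read() method) is the safest thing to pass in that case:
#
#     toks = list(tokenize_stream(iter(["x = 1\n", "y = x + 2\n"])))
#     toks = list(tokenize_stream(StringIO("# coding: latin-1\nz = 3\n")))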

def parse_block(tokens):
    """Turn an iterable of tokens into a tree of statements

    Returns a `block`, where a block is a list of (`statement`,`block`) tuples.
    Each `statement` is a list of the tokens composing a logical source line,
    and each `block` represents the block indented beneath `statement`, if any.
    Each nested `block` is in turn a list of (`statement`,`block`) tuples.  If
    a given statement has no nested block beneath it, its `block` is an empty
    list.
    """
    scopes = []
    parens = []
    indents = [0]
    output = scope = []
    stmt = []

    for tok, val, start, end, line in tokens:

        if tok==DEDENT or tok==ENDMARKER:   # exiting a scope
            if tok==DEDENT:
                if len(line[:start[1]].expandtabs()) not in indents:
                    raise TokenError(
                        "unindent does not match any outer indentation level",
                        start
                    )
                indents.pop()
            elif parens:
                raise TokenError(
                    "Unclosed parentheses %r" % (tuple(parens[::-1]),), start
                )

            if stmt:
                scope.append((stmt,[]))
                stmt = []
            if scopes:
                assert tok != ENDMARKER, "Missing DEDENT tokens"
                scope = scopes.pop()
            else:
                assert tok == ENDMARKER, "Extra DEDENT tokens"
                return output

        elif tok==INDENT:
            if not scope:   # no statement that this indent is under!
                raise TokenError("Unexpected indent", start)
            scopes.append(scope)
            indents.append(len(val.expandtabs()))
            scope = scope[-1][1]    # fill in block under statement

        else:
            stmt.append((tok, val, start, end, line))

            if tok==NEWLINE:
                scope.append((stmt,[]))
                stmt = []

            elif tok==OP:
                if val in OPEN_PARENS:
                    parens.append(val)
                elif val in CLOSE_PARENS:
                    if not parens or parens.pop()!=CLOSE_PARENS[val]:
                        raise TokenError("Unmatched "+val, start)

    assert False, "Token stream didn't have an ENDMARKER"
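
# Illustrative sketch (not from the original source): for a two-line input,
# the returned tree is one top-level (statement, block) pair whose block in
# turn holds the single nested statement:
#
#     tree = parse_block(tokenize_string("if x:\n    y = 1\n"))
#     (if_stmt, if_body), = tree         # tokens of "if x:" and its block
#     (assign_stmt, empty), = if_body    # tokens of "y = 1"; empty sub-block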


def flatten_block(block):
    """Yield the tokens composing `block`"""
    for stmt,subblock in block:
        for tok in stmt:
            yield tok
        if subblock:
            for tok in flatten_block(subblock):
                yield tok
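
# Sketch of an assumed round trip: flattening restores the statement tokens in
# source order, so a parsed tree can be re-serialized with detokenize and
# should reproduce the original layout:
#
#     tree = parse_block(tokenize_string(source))   # `source`: Python-like text
#     text = detokenize(flatten_block(tree))        # text should match `source`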

def strip_ws(tokens):
    """Yield non-whitespace tokens from `tokens`"""
    for tok in tokens:
        if tok[0] not in WHITESPACE:
            yield tok
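
# Illustrative sketch: strip_ws is handy before pattern-matching on tokens,
# since it drops the NEWLINE/NL/COMMENT/INDENT/DEDENT noise:
#
#     toks = list(strip_ws(tokenize_string("x = 1  # comment\n")))
#     # token types/values left: (NAME, 'x'), (OP, '='), (NUMBER, '1')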



def detokenize(tokens, indent=0):
    """Convert `tokens` iterable back to a string."""
    out = []; add = out.append
    lr,lc,last = 0,0,''
    baseindent = None
    for tok, val, (sr,sc), (er,ec), line in tokens:
        # Insert trailing line continuation and blanks for skipped lines
        lr = lr or sr   # first line of input is first line of output
        if sr>lr:
            if last:
                if len(last)>lc:
                    add(last[lc:])
                lr+=1
            if sr>lr:
                add(' '*indent + '\\\n'*(sr-lr))    # blank continuation lines
            lc = 0

        # Re-indent first token on line
        if lc==0:
            if tok==INDENT:
                continue  # we want to dedent first actual token
            else:
                curindent = len(line[:sc].expandtabs())
                if baseindent is None and tok not in WHITESPACE:
                    baseindent = curindent
                elif baseindent is not None and curindent>=baseindent:
                    add(' ' * (curindent-baseindent))
                if indent and tok not in (DEDENT, ENDMARKER, NL, NEWLINE):
                    add(' ' * indent)

        # Not at start of line, handle intraline whitespace by retaining it
        elif sc>lc:
            add(line[lc:sc])

        if val:
            add(val)

        lr,lc,last = er,ec,line

    return ''.join(out)
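
# Illustrative sketch of combining the pieces above to reindent code at a
# fixed width per nesting level (this is not the reformatter referred to in
# the revision log; the 4-space step is an arbitrary choice here):
#
#     def reindent(block, step=4, level=0):
#         out = []
#         for stmt, sub in block:
#             out.append(detokenize(stmt, level))
#             out.append(reindent(sub, step, level + step))
#         return ''.join(out)
#
#     src = "if x:\n        y = 1\n"                 # over-indented body
#     reindent(parse_block(tokenize_string(src)))    # -> "if x:\n    y = 1\n"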

