View of /SCALE/scale/dsl.py

Revision: 2086
Sun Sep 4 07:06:35 2005 UTC by pje
File size: 6889 byte(s)
Created a new project for SCALE - the Syntax for Configuration
And Language Extensions.  So far, this just includes a nifty
generic parser for "Python-like" languages.  One of the examples
that shows the power of the parser is a 7-line Python code 
reformatter that can reindent Python code without changing its
semantics, properly handling comments, multi-line strings,
continued lines, and much much more.  The actual SCALE mini-
language will be implemented atop the generic parser, and it
should be pretty easy to create other domain-specific languages
(like parser generators and other code generators) atop it as
well.
"""Parsing tools for SCALE and other Python-like Domain-Specific Languages"""

__all__ = [
    "tokenize_string", "tokenize_stream", "tokenize_file", "detokenize",
    "parse_block", "flatten_block", "strip_ws", "TokenError",
]

from StringIO import StringIO
from tokenize import generate_tokens, NL, NEWLINE, ENDMARKER, INDENT, DEDENT
from tokenize import COMMENT, NAME, NUMBER, STRING, ERRORTOKEN, OP, TokenError
import re

WHITESPACE = dict.fromkeys([NL,NEWLINE,ENDMARKER,INDENT,DEDENT,COMMENT])
OPEN_PARENS = dict.fromkeys('{[(')
CLOSE_PARENS = {'}':'{', ')':'(', ']':'['}

BOM = '\xef\xbb\xbf'
ENCODING_LINE = re.compile('('+BOM+')|\s*(#.*)?$').match
FIND_ENCODING = re.compile('coding[:=]\s*([-\w.]+)').search

def tokenize_string(text):
    """Yield the tokens of `text`

    `text` may be a string or unicode object.  If it's a string, PEP 263
    source encoding comments/BOM markers will be recognized.
    """
    return tokenize_stream(StringIO(text))
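
# Illustrative usage sketch (not part of the original module): each yielded
# item is a 5-tuple in the shape produced by the stdlib `tokenize` module,
# i.e. (token_type, value, (srow, scol), (erow, ecol), line):
#
#     for tok, val, start, end, line in tokenize_string("x = 1\n"):
#         print tok, repr(val)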

def tokenize_file(filename):
    """Yield the tokens contained in the file named by `filename`

    PEP 263 source encoding comments and BOM markers will be recognized.
    """
    return tokenize_stream(open(filename,'rU'))
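
# Illustrative sketch, assuming `example.py` stands for any Python-like source
# file on disk; the file is read in universal-newline mode:
#
#     for tok, val, start, end, line in tokenize_file("example.py"):
#         print start, repr(val)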







def tokenize_stream(file):
    """Yield the tokens contained in an iterable of lines

    This function assumes only that `file` is an iterable yielding string or
    Unicode objects.  If the first two lines are strings, they're checked for
    PEP 263 source encoding comments, and the first is checked for a UTF-8 BOM
    marker.  If an encoding or BOM is found, the remainder of the stream is
    decoded as unicode.  (Lines prior to the declaration line are not decoded.)
    """
    def reader(f):
        lno, encoding, bom = 1, None, False
        for line in f:
            if isinstance(line, unicode) or not ENCODING_LINE(line):
                yield line  
                break
            if lno==1 and line.startswith(BOM):
                bom = True
                encoding = "utf8"
                line = line[len(BOM):].decode(encoding)
            if line.find('coding')>=0:
                match = FIND_ENCODING(line)
                if match:
                    encoding = match.group(1)
                    if bom and encoding.lower() not in ('utf8', 'utf-8'):
                        raise TokenError(
                            ("UTF-8 BOM, but %r encoding requested" %encoding),
                            (lno,match.start(1))
                        )
            yield line
            lno += 1
            if encoding or lno>2:
                break
        if encoding:
            from codecs import getreader
            f = getreader(encoding)(f)

        for line in f:
            yield line
        yield ''
    return generate_tokens(reader(file).next)
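
# Illustrative sketch (assumptions noted): any iterator of lines works when no
# coding declaration is present; when one *is* found, the remaining stream is
# wrapped with codecs.getreader, so a file object or StringIO (anything with a
# .read() method) is the safest thing to pass in that case:
#
#     toks = list(tokenize_stream(iter(["x = 1\n", "y = x + 2\n"])))
#     toks = list(tokenize_stream(StringIO("# coding: latin-1\nz = 3\n")))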

def parse_block(tokens):
    """Turn an iterable of tokens into a tree of statements

    Returns a `block`, where a block is a list of (`statement`,`block`) tuples.
    Each `statement` is a list of the tokens composing a logical source line,
    and each `block` represents the block indented beneath `statement`, if any.
    Each nested `block` is in turn a list of (`statement`,`block`) tuples.  If
    a given statement has no nested block beneath it, its `block` is an empty
    list.
    """
    scopes = []
    parens = []
    indents = [0]
    output = scope = []
    stmt = []

    for tok, val, start, end, line in tokens:

        if tok==DEDENT or tok==ENDMARKER:   # exiting a scope
            if tok==DEDENT:
                if len(line[:start[1]].expandtabs()) not in indents:
                    raise TokenError(
                        "unindent does not match any outer indentation level",
                        start
                    )
                indents.pop()
            elif parens:
                raise TokenError(
                    "Unclosed parentheses %r" % (tuple(parens[::-1]),), start
                )

            if stmt:
                scope.append((stmt,[]))
                stmt = []
            if scopes:
                assert tok != ENDMARKER, "Missing DEDENT tokens"
                scope = scopes.pop()
            else:
                assert tok == ENDMARKER, "Extra DEDENT tokens"
                return output

        elif tok==INDENT:
            if not scope:   # no statement that this indent is under!
                raise TokenError("Unexpected indent", start)
            scopes.append(scope)
            indents.append(len(val.expandtabs()))
            scope = scope[-1][1]    # fill in block under statement

        else:
            stmt.append((tok, val, start, end, line))

            if tok==NEWLINE:
                scope.append((stmt,[]))
                stmt = []

            elif tok==OP:
                if val in OPEN_PARENS:
                    parens.append(val)
                elif val in CLOSE_PARENS:
                    if not parens or parens.pop()!=CLOSE_PARENS[val]:
                        raise TokenError("Unmatched "+val, start)

    assert False, "Token stream didn't have an ENDMARKER"
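
# Illustrative sketch (not from the original source): for a two-line input,
# the returned tree is one top-level (statement, block) pair whose block in
# turn holds the single nested statement:
#
#     tree = parse_block(tokenize_string("if x:\n    y = 1\n"))
#     (if_stmt, if_body), = tree         # tokens of "if x:" and its block
#     (assign_stmt, empty), = if_body    # tokens of "y = 1"; empty sub-block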


def flatten_block(block):
    """Yield the tokens composing `block`"""
    for stmt,subblock in block:
        for tok in stmt:
            yield tok
        if subblock:
            for tok in flatten_block(subblock):
                yield tok
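
# Sketch of an assumed round trip: flattening restores the statement tokens in
# source order, so a parsed tree can be re-serialized with detokenize and
# should reproduce the original layout:
#
#     tree = parse_block(tokenize_string(source))   # `source`: Python-like text
#     text = detokenize(flatten_block(tree))        # text should match `source`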

def strip_ws(tokens):
    """Yield non-whitespace tokens from `tokens`"""
    for tok in tokens:
        if tok[0] not in WHITESPACE:
            yield tok
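
# Illustrative sketch: strip_ws is handy before pattern-matching on tokens,
# since it drops the NEWLINE/NL/COMMENT/INDENT/DEDENT noise:
#
#     toks = list(strip_ws(tokenize_string("x = 1  # comment\n")))
#     # token types/values left: (NAME, 'x'), (OP, '='), (NUMBER, '1')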



def detokenize(tokens, indent=0):
    """Convert `tokens` iterable back to a string."""
    out = []; add = out.append
    lr,lc,last = 0,0,''
    baseindent = None
    for tok, val, (sr,sc), (er,ec), line in tokens:
        # Insert trailing line continuation and blanks for skipped lines
        lr = lr or sr   # first line of input is first line of output
        if sr>lr:
            if last:
                if len(last)>lc:
                    add(last[lc:])
                lr+=1
            if sr>lr:
                add(' '*indent + '\\\n'*(sr-lr))    # blank continuation lines
            lc = 0

        # Re-indent first token on line
        if lc==0:
            if tok==INDENT:
                continue  # we want to dedent first actual token
            else:
                curindent = len(line[:sc].expandtabs())
                if baseindent is None and tok not in WHITESPACE:
                    baseindent = curindent
                elif baseindent is not None and curindent>=baseindent:
                    add(' ' * (curindent-baseindent))
                if indent and tok not in (DEDENT, ENDMARKER, NL, NEWLINE):
                    add(' ' * indent)

        # Not at start of line, handle intraline whitespace by retaining it
        elif sc>lc:
            add(line[lc:sc])

        if val:
            add(val)

        lr,lc,last = er,ec,line

    return ''.join(out)
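
# Illustrative sketch of combining the pieces above to reindent code at a
# fixed width per nesting level (this is not the reformatter referred to in
# the revision log; the 4-space step is an arbitrary choice here):
#
#     def reindent(block, step=4, level=0):
#         out = []
#         for stmt, sub in block:
#             out.append(detokenize(stmt, level))
#             out.append(reindent(sub, step, level + step))
#         return ''.join(out)
#
#     src = "if x:\n        y = 1\n"                 # over-indented body
#     reindent(parse_block(tokenize_string(src)))    # -> "if x:\n    y = 1\n"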

