2 years ago
22 kB
#!/usr/bin/env python
# Implementation of the LOLPython language.
# Converts from LOLPython to Python then optionally runs the Python.

# This package depends on PLY --

# Written by Andrew Dalke <>
# Dalke Scientific Software, LLC
# 1 June 2007, Gothenburg, Sweden
#
# This software is in the public domain.  For details see:
#

import sys
import keyword
import os
import types
from cStringIO import StringIO
from ply import *


__NAME__ = "lolpython"
__VERSION__ = "1.0"

# Translating LOLPython tokens to Python tokens
# This could be cleaned up.  For example, some of
# these tokens could be merged into one.
tokens = (
    "NAME",       # variable names
    "RESERVED",   # Used for Python reserved names
    "NUMBER",     # Integers and floats
    "STRING",
    "OP",         # Like the Python OP
    "CLOSE",      # Don't really need this..

    "COMMENT",
    "AUTOCALL",   # write t.value then add '('
    "INLINE",     # write t.value directly
    "FUTURE",     # for the "I FUTURE CAT WITH" statement
    "PRINT",      # VISIBLE -> stdout or COMPLAIN -> stderr

    "ENDMARKER",
    "COLON",
    "WS",
    "NEWLINE",
)


# Helper functions for making given token types.
# Each one retags the token in place and returns it so a lexer rule can
# simply "return OP(t, ...)".

def OP(t, value):
    """Turn token `t` into an "OP" token carrying the Python operator `value`."""
    t.type = "OP"
    t.value = value
    return t


def RESERVED(t, value):
    """Turn token `t` into a "RESERVED" token carrying the Python word `value`."""
    t.type = "RESERVED"
    t.value = value
    return t


def AUTOCALL(t, value):
    """Turn token `t` into an "AUTOCALL" token for the callable named `value`.

    The translator writes an AUTOCALL as `value` followed by "(", so the
    matching ")" is pushed on the lexer's paren stack for a later close
    token to pop.
    """
    t.type = "AUTOCALL"
    # Use the caller-supplied name.  This used to be hard-coded to "tuple",
    # which silently ignored the `value` argument.
    t.value = value
    t.lexer.paren_stack.append(")")
    return t


def INLINE(t, value):
    """Turn token `t` into an "INLINE" token; `value` is emitted verbatim."""
    t.type = "INLINE"
    t.value = value
    return t


#####

# ply uses a large regex for token detection, and sre is limited to
# 100 groups.  This grammar pushes the limit.  I use (?:non-grouping)
# parens to keep the count down.
#
# NOTE: for every t_* rule below the function's docstring IS the token
# regex, so explanatory text must go in comments, never in the docstring.

def t_ASSIGN(t):  # cannot be a simple pattern because it must
    r'CAN[ ]+HA[SZ]\b'  # come before the t_NAME definition
    return OP(t, "=")


def t_SINGLE_QUOTE_STRING(t):
    r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
    t.type = "STRING"
    # Drop the surrounding quotes and resolve backslash escapes
    # ("string-escape" is a Python 2 codec).
    t.value = t.value[1:-1].decode("string-escape")
    return t


def t_DOUBLE_QUOTE_STRING(t):
    r'"([^\\"]+|\\"|\\\\)*"'
    t.type = "STRING"
    # Drop the surrounding quotes and resolve backslash escapes
    # ("string-escape" is a Python 2 codec).
    t.value = t.value[1:-1].decode("string-escape")
    return t


# and LOL quoted strings!
# They end with /LOL
# No way to have "/LOL" in the string.
#
# NOTE: for every t_* rule the docstring IS the token regex, so the
# explanations below are comments rather than docstrings.
def t_LOL_STRING(t):
    r"LOL[ ]*((?!/LOL).|\n)*[ ]*/LOL"
    t.type = "STRING"
    # Strip the leading "LOL" and trailing "/LOL", then the padding blanks.
    t.value = t.value[3:-4].strip(" ")
    return t


# Aliases for the same thing - for extra cuteness
def t_LSQUARE(t):
    r"(?:SOME|LOOK[ ]AT|LET[ ]+THE)\b"
    t.lexer.paren_stack.append(']')
    return OP(t, "[")


def t_LPAREN(t):
    r"(?:WIT|THEZ)\b"
    t.lexer.paren_stack.append(')')
    return OP(t, "(")


def t_LBRACE(t):
    r"BUCKET\b"
    t.lexer.paren_stack.append("}")
    return OP(t, "{")


def t_CLOSE(t):
    r"(?:OK(!+|\b)|!+)"
    stack = t.lexer.paren_stack
    if t.value.startswith("OK"):
        num_closes = len(t.value) - 1  # OK -> 1, OK! -> 2, OK!!->3
    else:
        num_closes = len(t.value)  # ! -> 1, !! -> 2
    # Which close is this?  I use "OK" to match (, [ and {
    if len(stack) < num_closes:
        raise AssertionError("not enough opens on the stack: line %d"
                             % (t.lineno,))
    # Emit the pending closers innermost-first and pop them off the stack.
    t.value = "".join(stack[-num_closes:][::-1])
    del stack[-num_closes:]
    return t


def t_EQ(t):
    r"KINDA[ ]+LIKE\b"
    return OP(t, "==")


def t_NE(t):
    r"(?:KINDA[ ]+)?NOT[ ]+LIKE\b"
    return OP(t, "!=")


def t_is(t):
    r"KINDA[ ]+IS\b"
    return RESERVED(t, "is")


def t_GT(t):
    r"ATE[ ]+MORE[ ]+CHEEZBURGERS?[ ]+THAN\b"
    return OP(t, ">")


def t_LT(t):
    r"ATE[ ]+FEWER[ ]+CHEEZBURGERS?[ ]+THAN\b"
    return OP(t, "<")


def t_GTE(t):
    r"BIG[ ]+LIKE\b"
    return OP(t, ">=")


def t_LTE(t):
    r"SMALL[ ]+LIKE\b"
    return OP(t, "<=")


def t_RETURN(t):
    r"U[ ]+TAKE\b"
    return RESERVED(t, "return")


def t_yield(t):
    r"U[ ]+BORROW\b"
    return RESERVED(t, "yield")


def t_ELIF(t):
    r"OR[ ]+IZ\b"
    return RESERVED(t, "elif")


def t_ELSE(t):
    r"(?:(?:I[ ]+GIVE[ ]+UP|IZ[ ]+KEWL|ALL[ ]+DONE)|NOPE)\b"
    return RESERVED(t, "else")


def t_COLON(t):
    r"\?"
    t.value = ":"
    return t


def t_FROM(t):
    r"IN[ ]+MAI\b"
    return RESERVED(t, "from")


def t_EXCEPT(t):
    r"O[ ]+NOES\b"
    return RESERVED(t, "except")


def t_PLUS(t):
    r"ALONG[ ]+WITH\b"
    return OP(t, "+")


def t_MINUS(t):
    r"TAKE[ ]+AWAY\b"
    return OP(t, "-")


def t_PLUS_EQUAL(t):
    r"GETZ[ ]+ANOTHR\b"
    return OP(t, "+=")


def t_MINUS_EQUAL(t):
    r"THROW[SZ]?[ ]+AWAY\b"
    return OP(t, "-=")


# PLY tries function rules in definition order, so the longer
# "SMASHES INTO HAS" pattern must come before "SMASHES INTO"; otherwise
# t_DIV always wins and "/=" can never be produced.
def t_DIV_EQUAL(t):
    r"SMASHES[ ]+INTO[ ]+HAS\b"
    return OP(t, "/=")


def t_DIV(t):
    r"SMASHES[ ]+INTO\b"
    return OP(t, "/")


def t_TRUEDIV(t):
    r"SMASHES[ ]+NICELY[ ]+INTO\b"
    return OP(t, "//")


def t_MUL(t):
    r"OF[ ]THOSE\b"
    return OP(t, "*")


def t_MUL_EQUAL(t):
    r"COPIES[ ]+(?:HIM|HER|IT)SELF[ ]+BY\b"
    return OP(t, "*=")


def t_POW(t):
    r"BY[ ]+GRAYSKULL[ ]+POWER"
    return OP(t, "**")


def t_IN(t):
    r"IN[ ]+(?:UR|THE|THIS)\b"
    return OP(t, "in")


def t_del(t):
    r"DO[ ]+NOT[ ]+WANT\b"
    return RESERVED(t, "del")


def t_and(t):
    r"\&"
    return RESERVED(t, "and")


def t_or(t):
    r"OR[ ]+MABEE\b"
    return RESERVED(t, "or")


def t_pass(t):
    r"I[ ]+IZ[ ]+CUTE\b"
    return RESERVED(t, "pass")


def t_forever(t):
    r"WHILE[ ]+I[ ]+CUTE\b"
    return INLINE(t, "while 1")


def t_def(t):
    r"SO[ ]+IM[ ]+LIKE\b"
    return RESERVED(t, "def")


def t_class(t):
    r"ME[ ]+MAKE[ ]\b"
    return RESERVED(t, "class")


def t_future(t):
    r"I[ ]+FUTURE[ ]+CAT[ ]+WITH\b"
    t.type = "FUTURE"
    return t


def t_assert(t):
    r"SO[ ]+GOOD\b"
    return RESERVED(t, "assert")


def t_assert_not(t):
    r"AINT[ ]+GOOD\b"
    return INLINE(t, "assert not ")


def t_for(t):
    r"GIMME[ ]+EACH\b"
    return RESERVED(t, "for")


def t_list(t):
    r"ALL[ ]+OF\b"
    return AUTOCALL(t, "tuple")


# Single-word LOLPython names and the (token type, Python text) pair each
# one translates to.  t_NAME consults this table.
RESERVED_VALUES = {
    "EASTERBUNNY": ("NUMBER", "0"),
    "CHEEZBURGER": ("NUMBER", "1"),
    "CHOKOLET": ("NUMBER", "-1"),
    "TWIN": ("NUMBER", "2"),
    "TWINZ": ("NUMBER", "2"),
    "TWINS": ("NUMBER", "2"),
    "EVILTWIN": ("NUMBER", "-2"),
    "EVILTWINZ": ("NUMBER", "-2"),
    "EVILTWINS": ("NUMBER", "-2"),
    "ALLFINGERZ": ("NUMBER", "10"),
    "TOEZ": ("NUMBER", "-10"),
    "ONE": ("NUMBER", "1"),
    "ONCE": ("NUMBER", "1"),
    "TWO": ("NUMBER", "2"),
    "TWICE": ("NUMBER", "2"),
    "THR33": ("NUMBER", "3"),
    "FOUR": ("NUMBER", "4"),
    "FIV": ("NUMBER", "5"),
    "SIKS": ("NUMBER", "6"),
    "SEVN": ("NUMBER", "7"),
    "ATE": ("NUMBER", "8"),
    "NINE": ("NUMBER", "9"),

    "MEH": ("NAME", "False"),
    "YEAH": ("NAME", "True"),
    "VISIBLE": ("PRINT", "stdout"),
    "COMPLAIN": ("PRINT", "stderr"),
    "AND": ("OP", ","),
    "BLACKHOLE": ("RESERVED", "ZeroDivisionError"),
    "DONOTLIKE": ("AUTOCALL", "AssertionError"),

    "ANTI": ("OP", "-"),
    "IZ": ("RESERVED", "if"),
    "GIMME": ("RESERVED", "import"),
    "LIKE": ("RESERVED", "as"),
    "OWN": ("OP", "."),

    "PLZ": ("RESERVED", "try"),
    "HALP": ("RESERVED", "raise"),
    "WHATEVER": ("RESERVED", "finally"),
    "KTHX": ("RESERVED", "continue"),
    "KTHXBYE": ("RESERVED", "break"),

    "OVER": ("OP", "/"),
    "AINT": ("RESERVED", "not"),
    "ME": ("RESERVED", "self"),

    "STRING": ("AUTOCALL", "str"),
    "NUMBR": ("AUTOCALL", "int"),
    "BIGNESS": ("AUTOCALL", "len"),
    "NUMBRZ": ("AUTOCALL", "range"),
    "ADDED": ("AUTOCALL", ".append"),

    "ARGZ": ("INLINE", "_lol_sys.argv"),
    "THINGZ": ("INLINE", "()"),  # invisible tuple didn't sound right
    "THING": ("INLINE", "()"),   # sometimes it's better in singular form
    "MY": ("INLINE", "self."),
    "MYSELF": ("INLINE", "(self)"),

    "EVEN": ("INLINE", "% 2 == 0"),
    "ODD": ("INLINE", "% 2 == 1"),
    "WIF": ("RESERVED", "with"),
}


def t_FLOAT(t):
    r"""(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]? \d+)?"""
    # The blank inside the pattern relies on the lexer compiling its rules
    # with re.VERBOSE.  The matched text is passed through unchanged.
    t.type = "NUMBER"
    return t


# t_FLOAT above also matches plain digit runs and is defined first, so this
# rule looks unreachable in practice; both produce a NUMBER token anyway.
def t_INT(t):
    r"\d+"
    t.type = "NUMBER"
    return t


def t_INVISIBLE(t):
    r"INVISIBLE([ ]+(LIST|STRING|BUCKET))?\b"
    # "INVISIBLE <kind>" is an empty literal of that kind; a bare
    # "INVISIBLE" is None.
    if "LIST" in t.value:
        t.type = "INLINE"
        t.value = "[]"
    elif "STRING" in t.value:
        t.type = "INLINE"
        t.value = '""'
    elif "BUCKET" in t.value:
        t.type = "INLINE"
        t.value = "{}"
    else:
        RESERVED(t, "None")
    return t


# Not consuming the newline. Needed for "IZ EASTERBUNNY?
# BTW comment"
#
# NOTE: for every t_* rule the docstring IS the token regex, so the
# explanations below are comments rather than docstrings.
def t_COMMENT(t):
    r"[ ]*(?:BTW|WTF)[^\n]*"
    # Runs up to, but not including, the end of the line.
    return t


def t_NAME(t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    # Reserved LOLPython words are swapped for their Python counterpart;
    # anything else goes through untouched as a plain NAME.
    translation = RESERVED_VALUES.get(t.value)
    if translation is not None:
        t.type, t.value = translation
        if t.type == "AUTOCALL":
            # An AUTOCALL is written with an opening "(", so remember
            # the closer it will need.
            t.lexer.paren_stack.append(")")
    return t


def t_WS(t):
    r' [ ]+ '
    lexer = t.lexer
    # Whitespace only matters at the start of a line and outside of any
    # open bracket; everywhere else it is dropped.
    if not lexer.at_line_start or lexer.paren_stack:
        return None
    return t


# Don't generate newline tokens when inside of parens
def t_newline(t):
    r'\n+'
    lexer = t.lexer
    lexer.lineno += len(t.value)
    t.type = "NEWLINE"
    if lexer.paren_stack:
        return None
    return t


def t_error(t):
    #raise SyntaxError("Unknown symbol %r" % (t.value[0],))
    # Report the offending character on stdout and step over it.
    sys.stdout.write("Skipping " + repr(t.value[0]) + "\n")
    t.lexer.skip(1)


## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line.  It flags the check to
# see if the new line has changed indentation level.
# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    """Tag each token with `at_line_start` and `must_indent`, then re-yield it.

    `lexer.at_line_start` is refreshed after every token has been consumed
    so the WS lexer rule can tell whether leading whitespace matters.
    COMMENT tokens get `at_line_start` only; they never receive a
    `must_indent` attribute and do not change the tracking state.
    """
    fresh_line = True
    lexer.at_line_start = fresh_line
    pending = NO_INDENT

    for tok in tokens:
        tok.at_line_start = fresh_line
        kind = tok.type

        if kind == "COMMENT":
            # Comments are transparent to the indentation bookkeeping.
            pass
        elif kind == "COLON":
            tok.must_indent = False
            fresh_line = False
            pending = MAY_INDENT
        elif kind == "NEWLINE":
            tok.must_indent = False
            fresh_line = True
            # COLON followed by NEWLINE: the next real token must indent.
            if pending == MAY_INDENT:
                pending = MUST_INDENT
        elif kind == "WS":
            assert tok.at_line_start == True
            tok.must_indent = False
            fresh_line = True
        else:
            # A real token; only indent after COLON NEWLINE
            tok.must_indent = (pending == MUST_INDENT)
            fresh_line = False
            pending = NO_INDENT

        yield tok
        lexer.at_line_start = fresh_line


def _new_token(type, lineno):
    """Build a synthetic LexToken of `type` at `lineno` (no value, lexpos -1)."""
    synthetic = lex.LexToken()
    synthetic.type = type
    synthetic.value = None
    synthetic.lineno = lineno
    synthetic.lexpos = -1
    return synthetic


# Synthesize a DEDENT tag
def DEDENT(lineno):
    """Return a synthetic "DEDENT" token for line `lineno`."""
    return _new_token("DEDENT", lineno)


# Synthesize an INDENT tag
def INDENT(lineno):
    """Return a synthetic "INDENT" token for line `lineno`."""
    return _new_token("INDENT", lineno)


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    """Drop WS and blank-line NEWLINE tokens; insert INDENT / DEDENT tokens.

    `tokens` must already carry the `at_line_start` / `must_indent`
    attributes added by track_tokens_filter.  Raises IndentationError when
    a line's leading whitespace does not fit the block structure.
    """
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    tok = None
    depth = 0
    saw_ws = False

    for tok in tokens:
        kind = tok.type

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if kind == "WS":
            assert depth == 0
            depth = len(tok.value)
            saw_ws = True
            # Don't forward WS to the parser
            continue

        if kind == "NEWLINE":
            depth = 0
            # Blank lines are ignored; every other NEWLINE passes through.
            if not (saw_ws or tok.at_line_start):
                yield tok
            continue

        if kind == "COMMENT":
            yield tok
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level
        saw_ws = False

        if tok.must_indent:
            # The current depth must be larger than the previous level
            if depth <= levels[-1]:
                raise IndentationError("expected an indented block")
            levels.append(depth)
            yield INDENT(tok.lineno)

        elif tok.at_line_start:
            # Must be on the same level or one of the previous levels
            current = levels[-1]
            if depth > current:
                raise IndentationError(
                    "indentation increase but not in new block")
            if depth != current:
                # Back up; but only if it matches a previous level
                try:
                    keep = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                # One DEDENT per level being closed.
                while len(levels) > keep + 1:
                    yield DEDENT(tok.lineno)
                    levels.pop()

        yield tok

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert tok is not None
        for _ in levels[1:]:
            yield DEDENT(tok.lineno)


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def token_filter(lexer, add_endmarker = True):
    """Yield the lexer's tokens with INDENT/DEDENT inserted.

    Pulls raw tokens from `lexer.token()` until it returns None, runs them
    through track_tokens_filter and indentation_filter, and, when
    `add_endmarker` is true, finishes with a synthetic ENDMARKER carrying
    the line number of the last token (1 if there were no tokens).
    """
    token = None
    tokens = iter(lexer.token, None)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)


class LOLLexer(object):
    """PLY-based lexer for LOLPython that also produces INDENT/DEDENT tokens."""

    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        import re
        # The token rules contain layout blanks (e.g. r' [ ]+ '), so they
        # only work when compiled with re.VERBOSE.  OR it in explicitly
        # rather than relying on the PLY version to add it.
        self.lexer = lex.lex(debug=debug, optimize=optimize,
                             lextab=lextab, reflags=reflags | re.VERBOSE)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        """Start tokenizing the LOLPython source string `s`."""
        self.lexer.paren_stack = []
        self.lexer.input(s)
        self.token_stream = token_filter(self.lexer, add_endmarker)

    def token(self):
        """Return the next filtered token, or None once the stream is done."""
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


# Helper class to generate logically correct indented Python code
class IndentWriter(object):
    """Write text to `outfile`, indenting the first write of every line.

    Callers adjust `indent` (the nesting level) and set `at_first_column`
    back to True after finishing a line.
    """

    def __init__(self, outfile):
        self.outfile = outfile
        self.at_first_column = True
        self.indent = 0

    def write(self, text):
        """Write `text`, preceded by the current indentation at line start."""
        if self.at_first_column:
            # One blank per nesting level.
            self.outfile.write(" "*self.indent)
            self.at_first_column = False
        self.outfile.write(text)


# Split things up because the from __future__ statements must
# go before any other code.

HEADER = """# LOLPython to Python converter version 1.0
# Written by Andrew Dalke, who should have been working on better things, and modified for Nahamcon CTF by Kkevsterrr.

"""

BODY = """
# sys is used for COMPLAIN and ARGZ
import sys as _lol_sys

"""


def to_python(s):
    """Translate the LOLPython source string `s` and return Python source.

    The result is HEADER, any "from __future__ import" lines, BODY and then
    the translated program.  The generated code uses Python 2 constructs
    such as the print statement.  Raises AssertionError on a token type the
    translator does not know.
    """
    L = LOLLexer()
    L.input(s)

    header = StringIO()
    header.write(HEADER)
    header_output = IndentWriter(header)

    body = StringIO()
    body.write(BODY)
    body_output = IndentWriter(body)

    write = body_output.write
    output = body_output

    for t in iter(L.token_stream):
        if t.type == "NAME":
            # Need to escape names which are Python variables Do that
            # by appending an "_".  But then I also need to make sure
            # that "yield_" does not collide with "yield".  And you
            # thought you were being clever trying to use a Python
            # variable.  :)
            name = t.value.rstrip("_")
            if name in keyword.kwlist:
                write(t.value + "_ ")
            else:
                write(t.value + " ")

        elif t.type in ("RESERVED", "OP", "NUMBER", "CLOSE"):
            # While not pretty, I'll put a space after each
            # term because it's the simplest solution.  Otherwise
            # I'll need to track the amount of whitespace between
            # the tokens in the original text.
            write(t.value+" ")

        # XXX escape names which are special in Python!
        elif t.type == "STRING":
            write(repr(t.value) + " ")

        elif t.type == "COMMENT":
            # Not enough information to keep comments on the correct
            # indentation level.  This is good enough.  Ugly though.
            # Maybe I need to fix the tokenizer.
            #
            # The COMMENT pattern also swallows the blanks in front of
            # BTW/WTF, so strip those first; slicing [3:] off the raw value
            # used to chop the marker in the wrong place.
            write("#" + t.value.lstrip(" ")[3:] + "\n")
            output.at_first_column = True

        elif t.type == "COLON":
            write(":")

        elif t.type == "INDENT":
            output.indent += 1
        elif t.type == "DEDENT":
            output.indent -= 1
        elif t.type == "NEWLINE":
            write(t.value)
            output.at_first_column = True
            # A __future__ line (if any) is finished; go back to the body.
            output = body_output
            write = output.write
        elif t.type == "PRINT":
            if t.value == "stdout":
                write("print ")
            elif t.value == "stderr":
                write("print >>_lol_sys.stderr, ")
            else:
                raise AssertionError(t.value)
        elif t.type == "AUTOCALL":
            write(t.value + "(")
        elif t.type == "INLINE":
            write(t.value)
        elif t.type == "ENDMARKER":
            write("\n# The end.\n")
        elif t.type == "WS":
            output.leading_ws = t.value
        elif t.type == "FUTURE":
            # Write to the header.  This is a hack.  Err, a hairball.
            output = header_output
            write = output.write
            write("from __future__ import ")

        else:
            raise AssertionError(t.type)

    return header.getvalue() + body.getvalue()


# API code for doing the translation and exec'ing the result

def execfile(infile, module_name="__lolmain__"):
    "file, module_name -- exec the lolpython file in a newly created module"
    if not hasattr(infile, "read"):
        # A filename: read it and make sure the handle is closed again.
        with open(infile) as f:
            s = f.read()
    else:
        s = infile.read()
    return execstring(s, module_name)


def execstring(s, module_name="__lolmain__"):
    "s, module_name -- exec the lolpython string in a newly created module"
    python_s = to_python(s)
    # Doing this bit of trickiness so I can have LOLPython code act
    # like __main__.  This fix is enough to fool unittest.
    m = types.ModuleType(module_name)
    sys.modules[module_name] = m
    # NOTE(review): this executes whatever the translated program says;
    # only run LOLPython sources you trust.
    exec(python_s, m.__dict__)
    return m


def convert_file(infile, outfile):
    "read LOLPython code from infile, write converted Python code to outfile"
    opened_here = not hasattr(outfile, "write")
    if opened_here:
        outfile = open(outfile, "w")
    try:
        outfile.write(to_python(infile.read()))
    finally:
        # Only close a handle this function created; the caller owns the rest.
        if opened_here:
            outfile.close()


def convert(filenames):
    "convert LOLPython filenames into corresponding Python '.py' files"
    if not filenames:
        convert_file(sys.stdin, sys.stdout)
    else:
        for filename in filenames:
            base, ext = os.path.splitext(filename)
            with open(filename) as infile:
                with open(base + ".py", "w") as outfile:
                    convert_file(infile, outfile)


def help():
    """Write the command-line usage summary to stdout."""
    sys.stdout.write("""convert and run a lolpython program
Commands are:
    lolpython
              Read a lolpython program from stdin and execute it
    lolpython --convert
              Convert a lolpython program from stdin
              and generate python to stdout
    lolpython --convert filename1 [filename....]
              Convert a list of lolpython files into Python files
    lolpython filename [arg1 [arg2 ...]]
              Run a lolpython program using optional arguments
""" + "\n")


def main(argv):
    """Command-line entry point; `argv` is the full argument vector."""
    if len(argv) >= 2:
        if argv[1] == "--convert":
            convert(argv[2:])
            return
        if argv[1] == "--help":
            help()
            return
        if argv[1] == "--version":
            sys.stdout.write(__NAME__ + " " + __VERSION__ + "\n")
            return

        # otherwise, run the lolpython program
        # Shift sys.argv so the program sees its own name in argv[0].
        sys.argv = sys.argv[1:]
        filename = sys.argv[0]
        execfile(filename, "__main__")
    else:
        # commands from stdin
        execfile(sys.stdin)


if __name__ == "__main__":
    main(sys.argv)