#!/usr/bin/env python
################################################################################
#
#  qooxdoo - the new era of web development
#
#  http://qooxdoo.org
#
#  Copyright:
#    2006-2008 1&1 Internet AG, Germany, http://www.1und1.de
#
#  License:
#    LGPL: http://www.gnu.org/licenses/lgpl.html
#    EPL: http://www.eclipse.org/org/documents/epl-v10.php
#    See the LICENSE file in the project's top-level directory for details.
#
#  Authors:
#    * Sebastian Werner (wpbasti)
#    * Alessandro Sala (asala)
#
################################################################################

##
# Module Description
#
# NAME
#  tokenizer.py -- create tokens from JavaScript source code
#
# SYNTAX
#  tokenizer.py --help
#
#  or
#
#  import tokenizer
#  result = tokenizer.parseStream(string, id)
#
# DESCRIPTION
#  The module tokenizer.py creates JSON-style tokens from JavaScript source code.
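#
#  Each token is a plain Python dict whose keys can be read off the code below.
#  As a rough, illustrative sketch (values are made up; "id" is whatever was
#  passed as the uniqueId argument), tokenizing "foo;" might yield roughly:
#
#    { "type" : "name",  "detail" : "public",      "source" : "foo", "line" : 1, "column" : 1, "id" : "my.Class" }
#    { "type" : "token", "detail" : JSTOKENS[";"], "source" : ";",   "line" : 1, "column" : 4, "id" : "my.Class" }
#    { "type" : "eof",   "detail" : "",            "source" : "",    "line" : 1, "column" : 5, "id" : "my.Class" }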
#
#
##

import sys, string, re, optparse
import config, filetool, comment

R_WHITESPACE = re.compile(r"(\s+)")
R_NONWHITESPACE = re.compile("\S+")
R_NUMBER = re.compile("^[0-9]+")
R_NEWLINE = re.compile(r"(\n)")

# Ideas from: http://www.regular-expressions.info/examplesprogrammer.html
# Multicomment RegExp inspired by: http://ostermiller.org/findcomment.html

# build regexp strings
S_STRING_A = "'[^'\\\n]*(\\.|\n[^'\\\n]*)*'"
S_STRING_B = '"[^"\\\n]*(\\.|\n[^"\\\n]*)*"'

S_FLOAT = "([0-9]*\.[0-9]+(?:[eE][+-]?[0-9]+)?)"

S_OPERATORS_2 = r"(==)|(!=)|(\+\+)|(--)|(-=)|(\+=)|(\*=)|(/=)|(%=)|(&&)|(\|\|)|(\>=)|(\<=)|(>>)|(<<)|(\^\|)|(\|=)|(\^=)|(&=)|(::)|(\.\.)"
S_OPERATORS_3 = r"(===)|(!==)|(\<\<=)|(\>\>=)|(\>\>\>)"
S_OPERATORS_4 = r"(\>\>\>=)"
S_OPERATORS = "(" + S_OPERATORS_4 + "|" + S_OPERATORS_3 + "|" + S_OPERATORS_2 + ")"

S_REGEXP = "(\/(?!\*)[^\t\n\r\f\v\/]+?\/[mgi]*)"
#S_REGEXP = "(\/[^\t\n\r\f\v\/]+?\/[mgi]*)"
S_REGEXP_A = "\.(match|search|split)\s*\(\s*\(*\s*" + S_REGEXP + "\s*\)*\s*\)"
S_REGEXP_B = "\.(replace)\s*\(\s*\(*\s*" + S_REGEXP + "\s*\)*\s*?,?"
S_REGEXP_C = "\s*\(*\s*" + S_REGEXP + "\)*\.(test|exec)\s*\(\s*"
S_REGEXP_D = "(:|=|\?)\s*\(*\s*" + S_REGEXP + "\s*\)*"
S_REGEXP_ALL = S_REGEXP_A + "|" + S_REGEXP_B + "|" + S_REGEXP_C + "|" + S_REGEXP_D
#S_REGEXP_ALL = "(?P<regexp>" + S_REGEXP_A + "|" + S_REGEXP_B + "|" + S_REGEXP_C + "|" + S_REGEXP_D + ")"
# I would rather group only on the top-level expression, and there create a named group
# (sub-groups only if in dire need); the named groups provide not only the match, but
# also the classification (like "REGEXP"), to be retrieved through mo.groupdict(). This
# would allow you to build a tokenizer through regexps entirely.

S_ALL = "(" + comment.S_BLOCK_COMMENT + "|" + comment.S_INLINE_COMMENT + "|" + S_STRING_A + "|" + S_STRING_B + "|" + S_REGEXP_ALL + "|" + S_FLOAT + "|" + S_OPERATORS + ")"

# compile regexp strings
R_STRING_A = re.compile("^" + S_STRING_A + "$")
R_STRING_B = re.compile("^" + S_STRING_B + "$")
R_FLOAT = re.compile("^" + S_FLOAT + "$")
R_OPERATORS = re.compile(S_OPERATORS)
R_REGEXP = re.compile(S_REGEXP)
R_REGEXP_A = re.compile(S_REGEXP_A)
R_REGEXP_B = re.compile(S_REGEXP_B)
R_REGEXP_C = re.compile(S_REGEXP_C)
R_REGEXP_D = re.compile(S_REGEXP_D)
R_ALL = re.compile(S_ALL)


parseLine = 1
parseColumn = 1
parseUniqueId = ""


def protectEscape(s):
    return s.replace("\\\\", "__$ESCAPE0$__").replace("\\\"", "__$ESCAPE1$__").replace("\\\'", "__$ESCAPE2__").replace("\/", "__$ESCAPE3__").replace("\!", "__$ESCAPE4__")


def recoverEscape(s):
    return s.replace("__$ESCAPE0$__", "\\\\").replace("__$ESCAPE1$__", "\\\"").replace("__$ESCAPE2__", "\\'").replace("__$ESCAPE3__", "\/").replace("__$ESCAPE4__", "\!")


##
# parseElement -- classify a single element (reserved word, builtin, number or
# name) and return it as a token dict
#
def parseElement(element):
    global parseUniqueId
    global parseLine
    global parseColumn

    if config.JSRESERVED.has_key(element):
        # print "PROTECTED: %s" % JSRESERVED[content]
        obj = { "type" : "reserved", "detail" : config.JSRESERVED[element], "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }

    elif element in config.JSBUILTIN:
        # print "BUILTIN: %s" % content
        obj = { "type" : "builtin", "detail" : "", "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }

    elif R_NUMBER.search(element):
        # print "NUMBER: %s" % content
        obj = { "type" : "number", "detail" : "int", "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }

    elif element.startswith("__"):
        # print "PRIVATE NAME: %s" % content
        obj = { "type" : "name", "detail" : "private", "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }

    elif element.startswith("_"):
        # print "PROTECTED NAME: %s" % content
        obj = { "type" : "name", "detail" : "protected", "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }

    elif len(element) > 0:
        # print "PUBLIC NAME: %s" % content
        obj = { "type" : "name", "detail" : "public", "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }

    parseColumn += len(element)

    return obj


##
# parsePart -- tokenize a stretch of source that parseStream has not already
# consumed: names, numbers, single-character tokens, line ends and regexp
# literals that only become visible here
#
def parsePart(part):
    global parseUniqueId
    global parseLine
    global parseColumn

    tokens = []
    element = ""

    for line in R_NEWLINE.split(part):
        if line == "\n":
            tokens.append({ "type" : "eol", "source" : "", "detail" : "", "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId })
            parseColumn = 1
            parseLine += 1

        else:
            for item in R_WHITESPACE.split(line):
                if item == "":
                    continue

                if not R_NONWHITESPACE.search(item):
                    parseColumn += len(item)
                    continue

                # print "ITEM: '%s'" % item

                # doing the per-char iteration by hand, to be able to leap forward
                i = 0
                while item[i:]:
                #for char in item:
                    # look for a regexp
                    mo = R_REGEXP.match(item[i:])
                    if mo:
                        # if this looks like a regexp, make sure the preceding token is not a
                        # "left-hand operand" that would turn the expression into a division

                        # convert existing element
                        if element != "":
                            if R_NONWHITESPACE.search(element):
                                tokens.append(parseElement(element))

                            element = ""

                        # look behind
                        if (    (tokens[-1]['detail'] != 'int')
                            and (tokens[-1]['detail'] != 'float')
                            and (tokens[-1]['detail'] != 'RP')
                            and (tokens[-1]['detail'] != 'public')):
                            tokens.append({ "type" : "regexp", "detail" : "", "source" : recoverEscape(mo.group(0)), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
                            parseColumn += len(mo.group(0))
                            i += len(mo.group(0))
                            # leap forward past the consumed regexp (also guards against
                            # reading past the end of item when the regexp closes it)
                            continue

                    # work on single character tokens, otherwise concat to a bigger element
                    char = item[i]
                    i += 1

                    if config.JSTOKENS.has_key(char):
                        # convert existing element
                        if element != "":
                            if R_NONWHITESPACE.search(element):
                                tokens.append(parseElement(element))

                            element = ""

                        # add character to token list
                        tokens.append({ "type" : "token", "detail" : config.JSTOKENS[char], "source" : char, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId })
                        parseColumn += 1

                    else:
                        element += char

                # convert remaining stuff to tokens
                if element != "":
                    if R_NONWHITESPACE.search(element):
                        tokens.append(parseElement(element))

                    element = ""

    return tokens


##
# parseFragmentLead -- find the starting char POS of the pattern match result
# (fragment) in the source text (content), process content's prefix up to POS,
# thereby building up the token array (tokens), and return content without the
# processed prefix
#
def parseFragmentLead(content, fragment, tokens):
    pos = content.find(fragment)

    if pos > 0:
        tokens.extend(parsePart(recoverEscape(content[0:pos])))

    return content[pos+len(fragment):]


def hasLeadingContent(tokens):
    pos = len(tokens) - 1

    while pos > 0:
        if tokens[pos]["type"] == "eol":
            break

        else:
            return True

    return False


##
# Main parsing routine; qualifies tokens from the stream (operators,
# numbers, strings, ...)
#
def parseStream(content, uniqueId=""):
    # make global variables available
    global parseLine
    global parseColumn
    global parseUniqueId

    # reset global stuff
    parseColumn = 1
    parseLine = 1
    parseUniqueId = uniqueId

    # prepare storage
    tokens = []

    content = protectEscape(content)

    # print " * searching for patterns..."
    try:
        all = R_ALL.findall(content)
    except RuntimeError:
        print "Could not parse file %s" % uniqueId
        print "Generally this means that there is a syntactical problem with your source code."
        print "Please omit the usage of nested comments like '/* foo /* bar */'."
        sys.exit(1)

    # print " * structuring..."
    for item in all:
        fragment = item[0]

        # print "Found: '%s'" % fragment

        # Handle block comment
        if comment.R_BLOCK_COMMENT.match(fragment):
            source = recoverEscape(fragment)
            format = comment.getFormat(source)
            multiline = comment.isMultiLine(source)

            # print "Type:MultiComment"
            content = parseFragmentLead(content, fragment, tokens)

            # sort of intelligent "pop"
            atBegin = not hasLeadingContent(tokens)

            if re.compile("^\s*\n").search(content):
                atEnd = True
            else:
                atEnd = False

            # print "Begin: %s, End: %s" % (atBegin, atEnd)

            # Fixing source content
            if atBegin:
                source = comment.outdent(source, parseColumn - 1)

            source = comment.correct(source)

            if atEnd and not atBegin:
                connection = "after"
            else:
                connection = "before"

            tokens.append({ "type" : "comment", "detail" : format, "multiline" : multiline, "connection" : connection, "source" : source, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn, "begin" : atBegin, "end" : atEnd })
            parseLine += len(fragment.split("\n")) - 1

        # Handle inline comment
        elif comment.R_INLINE_COMMENT.match(fragment):
            # print "Type:SingleComment"
            source = recoverEscape(fragment)
            content = parseFragmentLead(content, fragment, tokens)

            atBegin = hasLeadingContent(tokens)
            atEnd = True

            if atBegin:
                connection = "after"
            else:
                connection = "before"

            source = comment.correct(source)

            tokens.append({ "type" : "comment", "detail" : "inline", "multiline" : False, "connection" : connection, "source" : source, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn, "begin" : atBegin, "end" : atEnd })

        # Handle single-quoted string
        elif R_STRING_A.match(fragment):
            # print "Type:StringA: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            source = recoverEscape(fragment)[1:-1]
            tokens.append({ "type" : "string", "detail" : "singlequotes", "source" : source.replace("\\\n", ""), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
            newLines = source.count("\\\n")
            parseLine += newLines
            if newLines:
                parseColumn = len(source) - source.rfind("\\\n") + 2
            else:
                parseColumn += len(source) + 2

        # Handle double-quoted string
        elif R_STRING_B.match(fragment):
            # print "Type:StringB: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            source = recoverEscape(fragment)[1:-1]
            tokens.append({ "type" : "string", "detail" : "doublequotes", "source" : source.replace("\\\n", ""), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
            newLines = source.count("\\\n")
            parseLine += newLines
            if newLines:
                parseColumn = len(source) - source.rfind("\\\n") + 2
            else:
                parseColumn += len(source) + 2

        # Handle float number
        elif R_FLOAT.match(fragment):
            # print "Type:Float: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            tokens.append({ "type" : "number", "detail" : "float", "source" : fragment, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })

        # Handle operator
        elif R_OPERATORS.match(fragment):
            # print "Type:Operator: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            tokens.append({ "type" : "token", "detail" : config.JSTOKENS[fragment], "source" : fragment, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })

        # Handle everything else
        else:
            fragresult = R_REGEXP.search(fragment)

            if fragresult:
                # print "Type:RegExp: %s" % fragresult.group(0)

                if R_REGEXP_A.match(fragment) or R_REGEXP_B.match(fragment) or R_REGEXP_C.match(fragment) or R_REGEXP_D.match(fragment):
                    content = parseFragmentLead(content, fragresult.group(0), tokens)
                    tokens.append({ "type" : "regexp", "detail" : "", "source" : recoverEscape(fragresult.group(0)), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
                else:
                    print "Bad regular expression: %s" % fragresult.group(0)
            else:
                print "Type:None!"

    # tokenize whatever remains after the last matched fragment
    tokens.extend(parsePart(recoverEscape(content)))

    tokens.append({ "type" : "eof", "source" : "", "detail" : "", "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })

    return tokens


def parseFile(fileName, uniqueId="", encoding="utf-8"):
    return parseStream(filetool.read(fileName, encoding), uniqueId)


def convertTokensToString(tokens):
    tokenizedString = ""

    for token in tokens:
        tokenizedString += "%s%s" % (token, "\n")

    return tokenizedString


def main():
    parser = optparse.OptionParser()

    parser.add_option("-w", "--write", action="store_true", dest="write", default=False, help="Writes file to incoming fileName + EXTENSION.")
    parser.add_option("-e", "--extension", dest="extension", metavar="EXTENSION", help="The EXTENSION to use", default=".tokenized")
    parser.add_option("--encoding", dest="encoding", default="utf-8", metavar="ENCODING", help="Defines the encoding expected for input files.")

    (options, args) = parser.parse_args()

    if len(args) == 0:
        print "Needs one or more arguments (files) to tokenize!"
        sys.exit(1)

    for fileName in args:
        if options.write:
            print "Compiling %s => %s%s" % (fileName, fileName, options.extension)
        else:
            print "Compiling %s => stdout" % fileName

        tokenString = convertTokensToString(parseFile(fileName, fileName, options.encoding))

        if options.write:
            filetool.save(fileName + options.extension, tokenString, options.encoding)
        else:
            try:
                print tokenString
            except UnicodeEncodeError:
                print " * Could not encode result to ascii. Use '-w' instead."
                sys.exit(1)


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print
        print " * Keyboard Interrupt"
        sys.exit(1)