Spaces:
Sleeping
Sleeping
;;; ECMAScript for Guile
;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;; Code:
(define-module (language ecmascript tokenize) | |
#:use-module (ice-9 rdelim) | |
#:use-module ((srfi srfi-1) #:select (unfold-right)) | |
#:use-module (system base lalr) | |
#:export (next-token make-tokenizer make-tokenizer/1 tokenize tokenize/1)) | |
;; Throw a `syntax-error' exception.
;;
;; WHAT is the message string, LOC is a source-location record or #f,
;; FORM is the offending datum (or #f), and ARGS collects any further
;; values.  The throw arguments are, in order: #f, WHAT, the source
;; properties computed from LOC (or #f when LOC is #f), FORM, #f, and
;; ARGS as a list.
(define (syntax-error what loc form . args)
  (let ((props (if loc
                   (source-location->source-properties loc)
                   #f)))
    (throw 'syntax-error #f what props form #f args)))
;; Build a source-location record describing PORT's current position.
;;
;; The record is made from the port's file name, line, column and `ftell'
;; offset; the offset is #f when `ftell' raises an exception (e.g. on a
;; port that cannot report a position).  The final constructor argument
;; is always #f.
(define (port-source-location port)
  (let ((file   (port-filename port))
        (line   (port-line port))
        (column (port-column port))
        (offset (false-if-exception (ftell port))))
    (make-source-location file line column offset #f)))
;; taken from SSAX, sorta
;;
;; Read characters from PORT up to, but not including, the first character
;; found in the string DELIMS, and return them as a string.  The delimiter
;; is left in the port.  Hitting end of input, either before anything is
;; read or before a delimiter is seen, raises a syntax error at LOC; in the
;; latter case the partial token is reported as the offending form.
(define (read-until delims port loc)
  (define (fail partial)
    (syntax-error "EOF while reading a token" loc partial))
  (cond
   ((eof-object? (peek-char port))
    (fail #f))
   (else
    (let ((token (read-delimited delims port 'peek)))
      (if (eof-object? (peek-char port))
          (fail token)
          token)))))
;; Return #t if C is an ASCII hexadecimal digit (0-9, a-f, A-F), else #f.
;;
;; C may be any object, in particular the EOF object returned by
;; `peek-char'/`read-char', for which the answer is #f.
;;
;; The test is deliberately restricted to ASCII: `char-numeric?' also
;; accepts non-ASCII Unicode decimal digits, which are not valid
;; ECMAScript HexDigits and which `hex->number' cannot convert.
(define (char-hex? c)
  (and (char? c)
       (or (char<=? #\0 c #\9)
           (char<=? #\a c #\f)
           (char<=? #\A c #\F))))
;; Return the distance between the code point of character C and that of
;; #\0; for the characters #\0 .. #\9 this is the digit's numeric value.
(define (digit->number c)
  (let ((zero (char->integer #\0)))
    (- (char->integer c) zero)))
;; Convert the hexadecimal digit character C to its numeric value.
;;
;; Characters satisfying `char-numeric?' are converted by `digit->number';
;; any other character is downcased and measured from #\a, so #\a/#\A map
;; to 10, #\b/#\B to 11, and so on.  No validation is done here: callers
;; are expected to have checked C with `char-hex?' first.
(define (hex->number c)
  (cond
   ((char-numeric? c)
    (digit->number c))
   (else
    (let ((offset (- (char->integer (char-downcase c))
                     (char->integer #\a))))
      (+ offset 10)))))
;; Handle input starting with "/", which is still unread in PORT.
;;
;; Depending on what follows the slash this
;;   - skips a "//" comment up to the end of the line, or a "/* ... */"
;;     comment, and returns the next token after it;
;;   - returns a `/' or `/=' token when DIV? is true (a division operator
;;     is acceptable at this point);
;;   - otherwise reads a regular expression literal.
;; A lone "/" at end of input yields a `/' token regardless of DIV?.
;; LOC is the source location recorded on the produced token.
(define (read-slash port loc div?)
  ;; Consume the remainder of a block comment whose "/*" has already been
  ;; read, including the closing "*/".
  (define (skip-block-comment)
    (let ((c (read-char port)))
      (cond
       ((eof-object? c)
        (syntax-error "EOF while in multi-line comment" loc #f))
       ((and (char=? c #\*) (eqv? (peek-char port) #\/))
        (read-char port))
       (else
        (skip-block-comment)))))

  (read-char port)                      ; the leading "/"
  (let ((following (peek-char port)))
    (cond
     ((eof-object? following)
      ;; hmm. error if we're not looking for a div? ?
      (make-lexical-token '/ loc #f))
     ((char=? following #\/)
      (read-line port)
      (next-token port div?))
     ((char=? following #\*)
      (read-char port)
      (skip-block-comment)
      (next-token port div?))
     ((not div?)
      (read-regexp port loc))
     ((char=? following #\=)
      (read-char port)
      (make-lexical-token '/= loc #f))
     (else
      (make-lexical-token '/ loc #f)))))
;; Read a regular expression literal from PORT; the opening "/" has
;; already been consumed by the caller.
;;
;; Returns a `RegexpLiteral' token located at LOC whose value is a pair
;; (BODY . FLAGS): BODY is the pattern text with backslash escapes kept
;; verbatim, FLAGS is the list of flag characters following the closing
;; "/", in source order.
;;
;; A "/" inside a bracketed character class ("[...]") does not terminate
;; the literal.  A raw or backslash-escaped line break inside the literal,
;; or end of input before the closing "/", raises a syntax error.
(define (read-regexp port loc)
  (let ((terms (string #\/ #\\ #\[ #\] #\nl #\cr)))

    ;; Characters allowed in the flags part (identifier characters).
    (define (flag-char? c)
      (and (not (eof-object? c))
           (or (char-alphabetic? c)
               (char-numeric? c)
               (char=? c #\$)
               (char=? c #\_))))

    ;; Collect the flags after the closing "/" and build the token.
    (define (finish body)
      (let lp ((c (peek-char port)) (flags '()))
        (if (flag-char? c)
            (begin (read-char port)
                   (lp (peek-char port) (cons c flags)))
            (make-lexical-token 'RegexpLiteral loc
                                (cons body (reverse flags))))))

    ;; BODY is the pattern text read so far; IN-CLASS? tells whether we are
    ;; between "[" and "]".  `read-until' leaves the delimiter in the port
    ;; and raises on EOF, so TERMINATOR is always a character.
    (let lp ((body (read-until terms port loc)) (in-class? #f))
      (let ((terminator (peek-char port)))
        ;; Append TEXT (already consumed) and the next delimiter-free run.
        ;; The `let' forces the read to happen before the loop call rather
        ;; than relying on argument evaluation order.
        (define (continue text in-class?)
          (let ((more (read-until terms port loc)))
            (lp (string-append body text more) in-class?)))
        (cond
         ((char=? terminator #\\)
          (read-char port)
          (let ((echar (read-char port)))
            (cond
             ((eof-object? echar)
              (syntax-error "EOF while reading a token" loc body))
             ((or (char=? echar #\nl) (char=? echar #\cr))
              (syntax-error "regexp literals may not contain newlines"
                            loc body))
             (else
              (continue (string #\\ echar) in-class?)))))
         ((char=? terminator #\[)
          (read-char port)
          (continue "[" #t))
         ((char=? terminator #\])
          (read-char port)
          (continue "]" #f))
         ((char=? terminator #\/)
          (read-char port)
          (if in-class?
              (continue "/" #t)         ; "/" is literal inside a class
              (finish body)))
         (else
          (syntax-error "regexp literals may not contain newlines"
                        loc body)))))))
;; Read a string literal from PORT, whose next character is the opening
;; quote (either #\" or #\').  Returns a `StringLiteral' token located at
;; LOC whose value is the decoded string.
;;
;; Supported escapes: \' \" \\ \b \f \n \r \t \v, \0 (NUL, when not
;; followed by a digit), \xHH, \uHHHH, and a backslash followed by a line
;; break (a line continuation, which contributes nothing to the value).
;; Any other escaped character stands for itself.
;;
;; Raises a syntax error for: an unescaped line break inside the literal,
;; end of input before the closing quote or inside an escape, octal
;; escapes, malformed \x / \u escapes, and \u escapes naming a surrogate
;; code point (which `integer->char' cannot represent).
(define (read-string port loc)
  (let* ((quote-char (read-char port))
         (terms (string quote-char #\\ #\nl #\cr)))

    ;; Read exactly N hex digits and return their value.  On EOF or a
    ;; non-hex character, raise MESSAGE with the characters actually read
    ;; as the offending form.
    (define (read-hex-code n message)
      (let lp ((remaining n) (seen '()) (code 0))
        (if (zero? remaining)
            code
            (let ((c (read-char port)))
              (if (char-hex? c)
                  (lp (1- remaining)
                      (cons c seen)
                      (+ (* 16 code) (hex->number c)))
                  (syntax-error message loc
                                (list->string
                                 (reverse (if (char? c)
                                              (cons c seen)
                                              seen)))))))))

    ;; Decode one escape sequence; the backslash has been consumed.
    ;; Returns the string the escape contributes to the literal.
    (define (read-escape)
      (let ((c (read-char port)))
        (cond
         ((eof-object? c)
          (syntax-error "EOF while reading a token" loc #f))
         ((char=? c #\nl)
          "")                           ; line continuation
         ((char=? c #\cr)
          ;; line continuation; swallow the LF of a CR LF pair
          (if (eqv? (peek-char port) #\nl)
              (read-char port))
          "")
         (else
          (string
           (case c
             ((#\' #\" #\\) c)
             ((#\b) #\bs)
             ((#\f) #\np)
             ((#\n) #\nl)
             ((#\r) #\cr)
             ((#\t) #\tab)
             ((#\v) #\vt)
             ((#\0)
              (let ((next (peek-char port)))
                (if (and (char? next) (char-numeric? next))
                    (syntax-error "octal escape sequences are not supported"
                                  loc #f)
                    #\nul)))
             ((#\x)
              (integer->char
               (read-hex-code 2 "bad hex character escape")))
             ((#\u)
              (let ((code (read-hex-code 4 "bad unicode character escape")))
                (if (<= #xD800 code #xDFFF)
                    (syntax-error
                     "unicode escape denotes a surrogate code point"
                     loc code)
                    (integer->char code))))
             (else
              c)))))))

    ;; `read-until' leaves the delimiter in the port and raises on EOF, so
    ;; TERMINATOR below is always a character.
    (let lp ((str (read-until terms port loc)))
      (let ((terminator (peek-char port)))
        (cond
         ((char=? terminator quote-char)
          (read-char port)
          (make-lexical-token 'StringLiteral loc str))
         ((char=? terminator #\\)
          (read-char port)
          ;; let* makes the reading order explicit: escape first, then the
          ;; following run of ordinary characters.
          (let* ((escaped (read-escape))
                 (more (read-until terms port loc)))
            (lp (string-append str escaped more))))
         (else
          (syntax-error "string literals may not contain newlines"
                        loc str)))))))
;; Association list mapping each reserved word's spelling (a string) to
;; the token category symbol of the same name.
(define *keywords*
  (map (lambda (word)
         (cons (symbol->string word) word))
       '(break else new var
         case finally return void
         catch for switch while
         continue function this with
         default if throw
         delete in try
         do instanceof typeof
         ;; these aren't exactly keywords, but hey
         null true false)))
;; Association list of words reserved for future use, mapping each
;; spelling (a string) to the symbol of the same name.
(define *future-reserved-words*
  (map (lambda (word)
         (cons (symbol->string word) word))
       '(abstract enum int short
         boolean export interface static
         byte extends long super
         char final native synchronized
         class float package throws
         const goto private transient
         debugger implements protected volatile
         double import public)))
;; Read a maximal run of identifier characters (alphabetic, numeric, "$"
;; or "_") from PORT and classify it:
;;   - a word found in `*keywords*' yields a token whose category is the
;;     associated symbol and whose value is #f;
;;   - a word found in `*future-reserved-words*' raises a syntax error;
;;   - anything else yields an `Identifier' token whose value is the word
;;     as a symbol.
;; LOC is the source location attached to the token or error.
(define (read-identifier port loc)
  (define (identifier-part? c)
    (and (not (eof-object? c))
         (or (char-alphabetic? c)
             (char-numeric? c)
             (char=? c #\$)
             (char=? c #\_))))

  (define (classify word)
    (let ((keyword (assoc-ref *keywords* word)))
      (cond
       (keyword
        (make-lexical-token keyword loc #f))
       ((assoc-ref *future-reserved-words* word)
        (syntax-error "word is reserved for the future, dude."
                      loc word))
       (else
        (make-lexical-token 'Identifier loc (string->symbol word))))))

  ;; CHARS accumulates the word in reverse.
  (let collect ((chars '()))
    (let ((c (peek-char port)))
      (if (identifier-part? c)
          (begin
            (read-char port)
            (collect (cons c chars)))
          (classify (list->string (reverse chars)))))))
;; Read a numeric literal from PORT and return its value as a number.
;; LOC is the source location used in error reports.
;;
;; The next character must be a decimal digit, or a "." that begins a
;; fraction (in which case an integer part of 0 is assumed).  Recognised
;; forms:
;;   - "0x"/"0X" followed by one or more hex digits;
;;   - "0" followed by digits: a legacy octal literal (8 and 9 are errors);
;;   - decimal digits, optionally followed by ONE fractional part and/or
;;     an exponent ("e"/"E", optional sign, one or more digits).
;;
;; Integer forms return exact integers.  A fractional part makes the
;; result inexact, as does a negative exponent.  Reading stops at the
;; first character that cannot continue the literal; in particular a
;; second "." is left in the port ("1.2.3" reads as 1.2, "1..x" as 1.0).
(define (read-numeric port loc)

  ;; Digits after "0x"; at least one is required.
  (define (read-hex)
    (let ((first (peek-char port)))
      (if (not (char-hex? first))
          (syntax-error "bad digit reading hexadecimal number"
                        loc first))
      (let lp ((c first) (acc 0))
        (if (char-hex? c)
            (begin
              (read-char port)
              (lp (peek-char port) (+ (* 16 acc) (hex->number c))))
            acc))))

  ;; Digits of a legacy octal literal; the leading "0" has been consumed.
  (define (read-octal)
    (let lp ((c (peek-char port)) (acc 0))
      (cond
       ((or (eof-object? c) (not (char-numeric? c)))
        acc)
       ((or (char=? c #\8) (char=? c #\9))
        (syntax-error "invalid digit in octal sequence" loc c))
       (else
        (read-char port)
        (lp (peek-char port) (+ (* 8 acc) (digit->number c)))))))

  ;; Digits after the decimal point (the "." has been consumed).  Returns
  ;; their contribution as an inexact number; 0.0 if there are none.
  (define (read-fraction)
    (let lp ((c (peek-char port)) (dec 0.0) (n -1))
      (if (and (not (eof-object? c)) (char-numeric? c))
          (begin
            (read-char port)
            (lp (peek-char port)
                (+ dec (* (digit->number c) (expt 10 n)))
                (1- n)))
          dec)))

  ;; Exponent part (the "e"/"E" has been consumed).  Returns MANTISSA
  ;; scaled by the power of ten that was read.
  (define (read-exponent mantissa)
    ;; ADD folds each digit into the exponent: `+' builds a positive
    ;; exponent, `-' a negative one.
    (let ((add (let ((c (peek-char port)))
                 (cond ((eof-object? c)
                        (syntax-error "error reading exponent: EOF"
                                      loc #f))
                       ((char=? c #\+) (read-char port) +)
                       ((char=? c #\-) (read-char port) -)
                       ((char-numeric? c) +)
                       (else
                        (syntax-error "error reading exponent: non-digit"
                                      loc c))))))
      ;; A sign must be followed by at least one digit.
      (let ((first (peek-char port)))
        (cond ((eof-object? first)
               (syntax-error "error reading exponent: EOF" loc #f))
              ((not (char-numeric? first))
               (syntax-error "error reading exponent: non-digit"
                             loc first))))
      (let lp ((c (peek-char port)) (e 0))
        (if (and (not (eof-object? c)) (char-numeric? c))
            (begin
              (read-char port)
              (lp (peek-char port) (add (* 10 e) (digit->number c))))
            ;; Force an inexact result for negative exponents.
            (* (if (negative? e) (* mantissa 1.0) mantissa)
               (expt 10 e))))))

  ;; Decimal literal whose value so far is START.  FRACTION? records
  ;; whether a fractional part has already been read, so that a second
  ;; "." ends the literal instead of being absorbed into it.
  (define (read-decimal start)
    (let lp ((acc start) (fraction? #f))
      (let ((c (peek-char port)))
        (cond
         ((eof-object? c)
          acc)
         ((char-numeric? c)
          (read-char port)
          (lp (+ (* 10 acc) (digit->number c)) fraction?))
         ((or (char=? c #\e) (char=? c #\E))
          (read-char port)
          (read-exponent acc))
         ((and (char=? c #\.) (not fraction?))
          (read-char port)
          ;; loop back to catch an exponential part
          (lp (+ acc (read-fraction)) #t))
         (else
          acc)))))

  ;; C0 is the first digit; when the literal starts with "." nothing is
  ;; consumed and an integer part of "0" is assumed.
  (let* ((c0 (if (char=? (peek-char port) #\.)
                 #\0
                 (read-char port)))
         (c1 (peek-char port)))
    (cond
     ((eof-object? c1)
      (digit->number c0))
     ((and (char=? c0 #\0) (or (char=? c1 #\x) (char=? c1 #\X)))
      (read-char port)
      (read-hex))
     ((and (char=? c0 #\0) (char-numeric? c1))
      (read-octal))
     (else
      (read-decimal (digit->number c0))))))
;; Association list mapping every punctuator except "/" and "/=" to its
;; token category symbol.  Most categories are spelled like the
;; punctuator itself; brackets, ".", ";", ",", ":", "|", "||" and "|="
;; have word-like names instead.
(define *punctuation*
  '(;; brackets and separators
    ("{" . lbrace) ("}" . rbrace)
    ("(" . lparen) (")" . rparen)
    ("[" . lbracket) ("]" . rbracket)
    ("." . dot) (";" . semicolon) ("," . comma)
    ;; comparison
    ("<" . <) (">" . >) ("<=" . <=) (">=" . >=)
    ("==" . ==) ("!=" . !=) ("===" . ===) ("!==" . !==)
    ;; arithmetic
    ("+" . +) ("-" . -) ("*" . *) ("%" . %)
    ("++" . ++) ("--" . --)
    ;; shifts and bitwise operators
    ("<<" . <<) (">>" . >>) (">>>" . >>>)
    ("&" . &) ("|" . bor) ("^" . ^)
    ;; unary and logical operators
    ("!" . !) ("~" . ~)
    ("&&" . &&) ("||" . or)
    ;; conditional
    ("?" . ?) (":" . colon)
    ;; assignment
    ("=" . =)
    ("+=" . +=) ("-=" . -=) ("*=" . *=) ("%=" . %=)
    ("<<=" . <<=) (">>=" . >>=) (">>>=" . >>>=)
    ("&=" . &=) ("|=" . bor=) ("^=" . ^=)))
;; The two punctuators beginning with "/", kept apart from
;; `*punctuation*'; each maps to the symbol spelled like the punctuator.
(define *div-punctuation*
  (list (cons "/" '/)
        (cons "/=" '/=)))
;; (read-punctuation port loc): read the longest punctuator available at
;; the head of PORT and return a token of the corresponding category,
;; located at LOC.  If the first character does not start any punctuator
;; in `*punctuation*', a syntax error is raised.
;;
;; Matching walks a character trie built once, when this definition is
;; evaluated:
;;
;;   node ::= (char (symbol | #f) node*)
;;
;; i.e. each node is keyed by a character, carries the category of the
;; punctuator ending at that character (or #f if none does), and is
;; followed by its child nodes.  `assv-ref' on a node list therefore
;; returns the tail ((symbol | #f) node*) of the matching node.
(define read-punctuation
  (let ()
    ;; Add punctuator TEXT with category SYMBOL to the node list NODES,
    ;; creating nodes as needed; returns the updated node list.
    (define (insert nodes text symbol)
      (let* ((ch (string-ref text 0))
             (nodes (if (assv-ref nodes ch)
                        nodes
                        (cons (list ch #f) nodes)))
             (node-tail (assv-ref nodes ch)))
        (if (= (string-length text) 1)
            (set-car! node-tail symbol)
            (set-cdr! node-tail
                      (insert (cdr node-tail) (substring text 1) symbol)))
        nodes))

    (define punc-tree
      (let build ((nodes '()) (puncs *punctuation*))
        (if (null? puncs)
            nodes
            (build (insert nodes (caar puncs) (cdar puncs))
                   (cdr puncs)))))

    (lambda (port loc)
      ;; CANDIDATE is the category of the punctuator matched so far.
      (let scan ((tree punc-tree) (candidate #f))
        (let* ((c (peek-char port))
               (node-tail (assv-ref tree c)))
          (cond
           (node-tail
            (read-char port)
            (scan (cdr node-tail) (car node-tail)))
           (candidate
            (make-lexical-token candidate loc #f))
           (else
            (syntax-error "bad syntax: character not allowed" loc c))))))))
;; Read and return the next token from PORT, or the symbol `*eoi*' at end
;; of input.
;;
;; DIV? tells how a "/" is to be interpreted: true when a division
;; operator may appear at this point, false when a regular expression
;; literal is expected instead.  Whitespace, line breaks and comments are
;; skipped.  The token's source location is the port position at which
;; this call started looking at it.
(define (next-token port div?)
  (let ((c (peek-char port))
        (loc (port-source-location port)))
    (case c
      ((#\ht #\vt #\np #\space #\x00A0) ; whitespace
       (read-char port)
       (next-token port div?))
      ((#\newline #\cr)                 ; line break
       (read-char port)
       (next-token port div?))
      ((#\/)
       ;; division, single comment, double comment, or regexp
       (read-slash port loc div?))
      ((#\" #\')                        ; string literal
       (read-string port loc))
      ((#\.)
       ;; Either the member-access punctuator or the start of a numeric
       ;; literal such as ".5".  Deciding needs one character of extra
       ;; lookahead: consume the dot, look at what follows, then push the
       ;; dot back so the chosen reader sees the input unchanged.
       (read-char port)
       (let ((following (peek-char port)))
         (unread-char c port)
         (if (and (char? following) (char<=? #\0 following #\9))
             (make-lexical-token 'NumericLiteral loc
                                 (read-numeric port loc))
             (read-punctuation port loc))))
      (else
       (cond
        ((eof-object? c)
         '*eoi*)
        ((or (char-alphabetic? c)
             (char=? c #\$)
             (char=? c #\_))
         ;; reserved word or identifier
         (read-identifier port loc))
        ((char-numeric? c)
         (make-lexical-token 'NumericLiteral loc (read-numeric port loc)))
        (else
         ;; punctuation
         (read-punctuation port loc)))))))
;; Return a thunk that yields the successive tokens of PORT, one per call,
;; and the symbol `*eoi*' once input is exhausted.
;;
;; The thunk remembers whether the token it last returned can end an
;; expression; if so, a following "/" is lexed as a division operator
;; rather than as the start of a regular expression literal.  Tokens that
;; end an expression are identifiers, numeric and string literals, `this',
;; `null', `true', `false', "]" and ")".  (")" is a heuristic: it also
;; closes the head of `if'/`while'/`for'/`with', after which a regexp is
;; legal, but division after a parenthesised expression is by far the
;; more common case.)
(define (make-tokenizer port)
  (let ((div? #f))
    (lambda ()
      (let ((tok (next-token port div?)))
        (set! div? (and (lexical-token? tok)
                        (memq (lexical-token-category tok)
                              '(Identifier NumericLiteral StringLiteral
                                this null true false
                                rparen rbracket))
                        #t))
        tok))))
;; Like `make-tokenizer', but for reading a single statement: the returned
;; thunk yields tokens from PORT until it has returned a ";" that is not
;; nested inside any parenthesis, bracket or brace, after which every call
;; returns the symbol `*eoi*' without reading further.  It also returns
;; `*eoi*' when the input itself is exhausted.
;;
;; Nesting is tracked on a stack of opening tokens.  A closing token whose
;; matching opener is not on top of the stack raises a syntax error at the
;; closing token's location.
;;
;; As in `make-tokenizer', a "/" is lexed as division when the previous
;; token can end an expression (identifier, numeric or string literal,
;; `this', `null', `true', `false', "]" or ")"), and as the start of a
;; regular expression literal otherwise.
(define (make-tokenizer/1 port)
  (let ((div? #f)
        (eoi? #f)
        (stack '()))

    ;; Remember the opening token TOK.
    (define (open! tok)
      (set! stack (cons tok stack)))

    ;; Pop the innermost opener if its category is OPENER; otherwise
    ;; report the closing token TOK with MESSAGE.
    (define (close! tok opener message)
      (if (and (pair? stack)
               (eq? (lexical-token-category (car stack)) opener))
          (set! stack (cdr stack))
          (syntax-error message (lexical-token-source tok) #f)))

    (lambda ()
      (if eoi?
          '*eoi*
          (let* ((tok (next-token port div?))
                 ;; #f for non-token results such as `*eoi*'
                 (cat (and (lexical-token? tok)
                           (lexical-token-category tok))))
            (case cat
              ((lparen lbracket lbrace)
               (open! tok))
              ((rparen)
               (close! tok 'lparen "unexpected right parenthesis"))
              ((rbracket)
               (close! tok 'lbracket "unexpected right bracket"))
              ((rbrace)
               (close! tok 'lbrace "unexpected right brace"))
              ((semicolon)
               ;; a top-level ";" ends the statement
               (set! eoi? (null? stack))))
            (set! div? (and (memq cat
                                  '(Identifier NumericLiteral StringLiteral
                                    this null true false
                                    rparen rbracket))
                            #t))
            tok)))))
;; Return the list of all tokens read from PORT, in source order, using a
;; tokenizer made by `make-tokenizer'.  The terminating `*eoi*' marker is
;; not included.
(define (tokenize port)
  (define next (make-tokenizer port))
  ;; OUT accumulates tokens in reverse; each step conses the token just
  ;; tested onto it while fetching the following one.
  (do ((tok (next) (next))
       (out '() (cons tok out)))
      ((eq? tok '*eoi*) (reverse! out))))
;; Return the list of tokens produced by a `make-tokenizer/1' tokenizer on
;; PORT, in source order, up to but not including the `*eoi*' marker.
(define (tokenize/1 port)
  (define next (make-tokenizer/1 port))
  (let collect ((tok (next)) (out '()))
    (cond
     ((eq? tok '*eoi*)
      (reverse! out))
     (else
      (let ((out (cons tok out)))
        (collect (next) out))))))