src/Pure/ML/ml_lex.ML
author wenzelm
Sat, 06 Jun 2009 19:58:11 +0200
changeset 31474 0ae32184bde0
parent 31430 5c9dbd511510
child 31546 5bef6c7cc819
permissions -rw-r--r--
export end_pos_of;
     1 (*  Title:      Pure/ML/ml_lex.ML
     2     Author:     Makarius
     3 
     4 Lexical syntax for SML.
     5 *)
     6 
     7 signature ML_LEX =
     8 sig
     9   datatype token_kind =
    10     Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
    11     Space | Comment | Error of string | EOF
    12   eqtype token
    13   val stopper: token Scan.stopper
    14   val is_regular: token -> bool
    15   val is_improper: token -> bool
    16   val set_range: Position.range -> token -> token
    17   val pos_of: token -> Position.T
    18   val end_pos_of: token -> Position.T
    19   val kind_of: token -> token_kind
    20   val content_of: token -> string
    21   val check_content_of: token -> string
    22   val flatten: token list -> string
    23   val report_token: token -> unit
    24   val keywords: string list
    25   val source: (Symbol.symbol, 'a) Source.source ->
    26     (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
    27       Source.source) Source.source
    28   val tokenize: string -> token list
    29   val read_antiq: Symbol_Pos.T list * Position.T -> token Antiquote.antiquote list
    30 end;
    31 
    32 structure ML_Lex: ML_LEX =
    33 struct
    34 
    35 (** tokens **)
    36 
    37 (* datatype token *)
    38 
    39 datatype token_kind =
    40   Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
    41   Space | Comment | Error of string | EOF;
    42 
    43 datatype token = Token of Position.range * (token_kind * string);
    44 
    45 
    46 (* position *)
    47 
    48 fun set_range range (Token (_, x)) = Token (range, x);
    49 
    50 fun pos_of (Token ((pos, _), _)) = pos;
    51 fun end_pos_of (Token ((_, pos), _)) = pos;
    52 
    53 
    54 (* control tokens *)
    55 
    56 fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
    57 val eof = mk_eof Position.none;
    58 
    59 fun is_eof (Token (_, (EOF, _))) = true
    60   | is_eof _ = false;
    61 
    62 val stopper =
    63   Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;
    64 
    65 
    66 (* token content *)
    67 
    68 fun kind_of (Token (_, (k, _))) = k;
    69 
    70 fun content_of (Token (_, (_, x))) = x;
    71 fun token_leq (tok, tok') = content_of tok <= content_of tok';
    72 
    73 fun check_content_of tok =
    74   (case kind_of tok of
    75     Error msg => error msg
    76   | _ => content_of tok);
    77 
    78 val flatten = implode o map (Symbol.escape o check_content_of);
    79 
    80 fun is_regular (Token (_, (Error _, _))) = false
    81   | is_regular (Token (_, (EOF, _))) = false
    82   | is_regular _ = true;
    83 
    84 fun is_improper (Token (_, (Space, _))) = true
    85   | is_improper (Token (_, (Comment, _))) = true
    86   | is_improper _ = false;
    87 
    88 
    89 (* markup *)
    90 
    91 val token_kind_markup =
    92  fn Keyword   => Markup.ML_keyword
    93   | Ident     => Markup.ML_ident
    94   | LongIdent => Markup.ML_ident
    95   | TypeVar   => Markup.ML_tvar
    96   | Word      => Markup.ML_numeral
    97   | Int       => Markup.ML_numeral
    98   | Real      => Markup.ML_numeral
    99   | Char      => Markup.ML_char
   100   | String    => Markup.ML_string
   101   | Space     => Markup.none
   102   | Comment   => Markup.ML_comment
   103   | Error _   => Markup.ML_malformed
   104   | EOF       => Markup.none;
   105 
   106 fun report_token (Token ((pos, _), (kind, _))) =
   107   Position.report (token_kind_markup kind) pos;
   108 
   109 
   110 
   111 (** scanners **)
   112 
   113 open Basic_Symbol_Pos;
   114 
   115 fun !!! msg = Symbol_Pos.!!! ("SML lexical error: " ^ msg);
   116 
   117 
   118 (* blanks *)
   119 
   120 val scan_blank = Scan.one (Symbol.is_ascii_blank o symbol);
   121 val scan_blanks1 = Scan.repeat1 scan_blank;
   122 
   123 
   124 (* keywords *)
   125 
   126 val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
   127   "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
   128   "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
   129   "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
   130   "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
   131   "sharing", "sig", "signature", "struct", "structure", "then", "type",
   132   "val", "where", "while", "with", "withtype"];
   133 
   134 val lex = Scan.make_lexicon (map explode keywords);
   135 fun scan_keyword x = Scan.literal lex x;
   136 
   137 
   138 (* identifiers *)
   139 
   140 local
   141 
   142 val scan_letdigs =
   143   Scan.many ((Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) o symbol);
   144 
   145 val scan_alphanumeric = Scan.one (Symbol.is_ascii_letter o symbol) -- scan_letdigs >> op ::;
   146 
   147 val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~") o symbol);
   148 
   149 in
   150 
   151 val scan_ident = scan_alphanumeric || scan_symbolic;
   152 
   153 val scan_longident =
   154   (Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "=");
   155 
   156 val scan_typevar = $$$ "'" @@@ scan_letdigs;
   157 
   158 end;
   159 
   160 
   161 (* numerals *)
   162 
   163 local
   164 
   165 val scan_dec = Scan.many1 (Symbol.is_ascii_digit o symbol);
   166 val scan_hex = Scan.many1 (Symbol.is_ascii_hex o symbol);
   167 val scan_sign = Scan.optional ($$$ "~") [];
   168 val scan_decint = scan_sign @@@ scan_dec;
   169 
   170 in
   171 
   172 val scan_word =
   173   $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
   174   $$$ "0" @@@ $$$ "w" @@@ scan_dec;
   175 
   176 val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
   177 
   178 val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
   179 
   180 val scan_real =
   181   scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
   182   scan_decint @@@ scan_exp;
   183 
   184 end;
   185 
   186 
   187 (* chars and strings *)
   188 
   189 local
   190 
   191 val scan_escape =
   192   Scan.one (member (op =) (explode "\"\\abtnvfr") o symbol) >> single ||
   193   $$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) ||
   194   Scan.one (Symbol.is_ascii_digit o symbol) --
   195     Scan.one (Symbol.is_ascii_digit o symbol) --
   196     Scan.one (Symbol.is_ascii_digit o symbol) >> (fn ((a, b), c) => [a, b, c]);
   197 
   198 val scan_str =
   199   Scan.one (fn (s, _) => Symbol.is_regular s andalso s <> "\"" andalso s <> "\\" andalso
   200     (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||
   201   $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
   202 
   203 val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
   204 val scan_gaps = Scan.repeat scan_gap >> flat;
   205 
   206 in
   207 
   208 val scan_char =
   209   $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
   210 
   211 val scan_string =
   212   $$$ "\"" @@@ !!! "missing quote at end of string"
   213     ((Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\"");
   214 
   215 end;
   216 
   217 
   218 (* scan tokens *)
   219 
   220 local
   221 
   222 fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));
   223 
   224 val scan_ml =
   225  (scan_char >> token Char ||
   226   scan_string >> token String ||
   227   scan_blanks1 >> token Space ||
   228   Symbol_Pos.scan_comment !!! >> token Comment ||
   229   Scan.max token_leq
   230    (scan_keyword >> token Keyword)
   231    (scan_word >> token Word ||
   232     scan_real >> token Real ||
   233     scan_int >> token Int ||
   234     scan_longident >> token LongIdent ||
   235     scan_ident >> token Ident ||
   236     scan_typevar >> token TypeVar));
   237 
   238 val scan_antiq = Antiquote.scan || scan_ml >> Antiquote.Text;
   239 
   240 fun recover msg =
   241   Scan.many (((not o Symbol.is_blank) andf Symbol.is_regular) o symbol)
   242   >> (fn cs => [token (Error msg) cs]);
   243 
   244 in
   245 
   246 fun source src =
   247   Symbol_Pos.source (Position.line 1) src
   248   |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover));
   249 
   250 val tokenize = Source.of_string #> source #> Source.exhaust;
   251 
   252 fun read_antiq (syms, pos) =
   253   (Source.of_list syms
   254     |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_antiq))
   255       (SOME (false, fn msg => recover msg >> map Antiquote.Text))
   256     |> Source.exhaust
   257     |> tap (List.app (Antiquote.report report_token))
   258     |> tap Antiquote.check_nesting
   259     |> tap (List.app (fn Antiquote.Text tok => ignore (check_content_of tok) | _ => ())))
   260   handle ERROR msg =>
   261     cat_error msg ("The error(s) above occurred in ML source" ^ Position.str_of pos);
   262 
   263 end;
   264 
   265 end;
   266