src/Pure/ML/ml_lex.ML
author wenzelm
Sat, 23 Jul 2011 16:37:17 +0200
changeset 44818 9b00f09f7721
parent 41750 967cbbc77abd
child 45583 fe319b45315c
permissions -rw-r--r--
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
     1 (*  Title:      Pure/ML/ml_lex.ML
     2     Author:     Makarius
     3 
     4 Lexical syntax for SML.
     5 *)
     6 
     7 signature ML_LEX =
     8 sig
     9   datatype token_kind =
    10     Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
    11     Space | Comment | Error of string | EOF
    12   eqtype token
    13   val stopper: token Scan.stopper
    14   val is_regular: token -> bool
    15   val is_improper: token -> bool
    16   val set_range: Position.range -> token -> token
    17   val pos_of: token -> Position.T
    18   val end_pos_of: token -> Position.T
    19   val kind_of: token -> token_kind
    20   val content_of: token -> string
    21   val check_content_of: token -> string
    22   val flatten: token list -> string
    23   val report_token: token -> unit
    24   val keywords: string list
    25   val source: (Symbol.symbol, 'a) Source.source ->
    26     (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
    27       Source.source) Source.source
    28   val tokenize: string -> token list
    29   val read: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list
    30 end;
    31 
    32 structure ML_Lex: ML_LEX =
    33 struct
    34 
    35 (** tokens **)
    36 
    37 (* datatype token *)
    38 
    39 datatype token_kind =
    40   Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
    41   Space | Comment | Error of string | EOF;
    42 
    43 datatype token = Token of Position.range * (token_kind * string);
    44 
    45 
    46 (* position *)
    47 
    48 fun set_range range (Token (_, x)) = Token (range, x);
    49 
    50 fun pos_of (Token ((pos, _), _)) = pos;
    51 fun end_pos_of (Token ((_, pos), _)) = pos;
    52 
    53 
    54 (* control tokens *)
    55 
    56 fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
    57 val eof = mk_eof Position.none;
    58 
    59 fun is_eof (Token (_, (EOF, _))) = true
    60   | is_eof _ = false;
    61 
    62 val stopper =
    63   Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;
    64 
    65 
    66 (* token content *)
    67 
    68 fun kind_of (Token (_, (k, _))) = k;
    69 
    70 fun content_of (Token (_, (_, x))) = x;
    71 fun token_leq (tok, tok') = content_of tok <= content_of tok';
    72 
    73 fun is_regular (Token (_, (Error _, _))) = false
    74   | is_regular (Token (_, (EOF, _))) = false
    75   | is_regular _ = true;
    76 
    77 fun is_improper (Token (_, (Space, _))) = true
    78   | is_improper (Token (_, (Comment, _))) = true
    79   | is_improper _ = false;
    80 
    81 fun warn tok =
    82   (case tok of
    83     Token (_, (Keyword, ":>")) =>
    84       warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
    85         \prefer non-opaque matching (:) possibly with abstype" ^
    86         Position.str_of (pos_of tok))
    87   | _ => ());
    88 
    89 fun check_content_of tok =
    90   (case kind_of tok of
    91     Error msg => error msg
    92   | _ => content_of tok);
    93 
    94 fun flatten_content (tok :: (toks as tok' :: _)) =
    95       Symbol.escape (check_content_of tok) ::
    96         (if is_improper tok orelse is_improper tok' then flatten_content toks
    97          else Symbol.space :: flatten_content toks)
    98   | flatten_content toks = map (Symbol.escape o check_content_of) toks;
    99 
   100 val flatten = implode o flatten_content;
   101 
   102 
   103 (* markup *)
   104 
   105 local
   106 
   107 val token_kind_markup =
   108  fn Keyword   => Markup.ML_keyword
   109   | Ident     => Markup.ML_ident
   110   | LongIdent => Markup.ML_ident
   111   | TypeVar   => Markup.ML_tvar
   112   | Word      => Markup.ML_numeral
   113   | Int       => Markup.ML_numeral
   114   | Real      => Markup.ML_numeral
   115   | Char      => Markup.ML_char
   116   | String    => Markup.ML_string
   117   | Space     => Markup.empty
   118   | Comment   => Markup.ML_comment
   119   | Error _   => Markup.ML_malformed
   120   | EOF       => Markup.empty;
   121 
   122 fun token_markup kind x =
   123   if kind = Keyword andalso exists_string (not o Symbol.is_ascii_letter) x
   124   then Markup.ML_delimiter
   125   else token_kind_markup kind;
   126 
   127 in
   128 
   129 fun report_token (Token ((pos, _), (kind, x))) = Position.report pos (token_markup kind x);
   130 
   131 end;
   132 
   133 
   134 
   135 (** scanners **)
   136 
   137 open Basic_Symbol_Pos;
   138 
   139 fun !!! msg = Symbol_Pos.!!! (fn () => "SML lexical error: " ^ msg);
   140 
   141 
   142 (* blanks *)
   143 
   144 val scan_blank = Scan.one (Symbol.is_ascii_blank o Symbol_Pos.symbol);
   145 val scan_blanks1 = Scan.repeat1 scan_blank;
   146 
   147 
   148 (* keywords *)
   149 
   150 val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
   151   "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
   152   "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
   153   "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
   154   "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
   155   "sharing", "sig", "signature", "struct", "structure", "then", "type",
   156   "val", "where", "while", "with", "withtype"];
   157 
   158 val lex = Scan.make_lexicon (map raw_explode keywords);
   159 fun scan_keyword x = Scan.literal lex x;
   160 
   161 
   162 (* identifiers *)
   163 
   164 local
   165 
   166 val scan_letdigs =
   167   Scan.many
   168     ((Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) o
   169       Symbol_Pos.symbol);
   170 
   171 val scan_alphanumeric =
   172   Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) -- scan_letdigs >> op ::;
   173 
   174 val scan_symbolic =
   175   Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol);
   176 
   177 in
   178 
   179 val scan_ident = scan_alphanumeric || scan_symbolic;
   180 
   181 val scan_longident =
   182   (Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "=");
   183 
   184 val scan_typevar = $$$ "'" @@@ scan_letdigs;
   185 
   186 end;
   187 
   188 
   189 (* numerals *)
   190 
   191 local
   192 
   193 val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol);
   194 val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol);
   195 val scan_sign = Scan.optional ($$$ "~") [];
   196 val scan_decint = scan_sign @@@ scan_dec;
   197 
   198 in
   199 
   200 val scan_word =
   201   $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
   202   $$$ "0" @@@ $$$ "w" @@@ scan_dec;
   203 
   204 val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
   205 
   206 val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
   207 
   208 val scan_real =
   209   scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
   210   scan_decint @@@ scan_exp;
   211 
   212 end;
   213 
   214 
   215 (* chars and strings *)
   216 
   217 local
   218 
   219 val scan_escape =
   220   Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single ||
   221   $$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) ||
   222   Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
   223     Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
   224     Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]);
   225 
   226 val scan_str =
   227   Scan.one (fn (s, _) => Symbol.is_regular s andalso s <> "\"" andalso s <> "\\" andalso
   228     (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||
   229   $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
   230 
   231 val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
   232 val scan_gaps = Scan.repeat scan_gap >> flat;
   233 
   234 in
   235 
   236 val scan_char =
   237   $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
   238 
   239 val scan_string =
   240   $$$ "\"" @@@ !!! "missing quote at end of string"
   241     ((Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\"");
   242 
   243 end;
   244 
   245 
   246 (* scan tokens *)
   247 
   248 local
   249 
   250 fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));
   251 
   252 val scan_ml =
   253  (scan_char >> token Char ||
   254   scan_string >> token String ||
   255   scan_blanks1 >> token Space ||
   256   Symbol_Pos.scan_comment !!! >> token Comment ||
   257   Scan.max token_leq
   258    (scan_keyword >> token Keyword)
   259    (scan_word >> token Word ||
   260     scan_real >> token Real ||
   261     scan_int >> token Int ||
   262     scan_longident >> token LongIdent ||
   263     scan_ident >> token Ident ||
   264     scan_typevar >> token TypeVar));
   265 
   266 val scan_antiq = Antiquote.scan || scan_ml >> Antiquote.Text;
   267 
   268 fun recover msg =
   269   Scan.many (((not o Symbol.is_blank) andf Symbol.is_regular) o Symbol_Pos.symbol)
   270   >> (fn cs => [token (Error msg) cs]);
   271 
   272 in
   273 
   274 fun source src =
   275   Symbol_Pos.source (Position.line 1) src
   276   |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover));
   277 
   278 val tokenize = Source.of_string #> Symbol.source #> source #> Source.exhaust;
   279 
   280 fun read pos txt =
   281   let
   282     val _ = Position.report pos Markup.ML_source;
   283     val syms = Symbol_Pos.explode (txt, pos);
   284     val termination =
   285       if null syms then []
   286       else
   287         let
   288           val pos1 = List.last syms |-> Position.advance;
   289           val pos2 = Position.advance Symbol.space pos1;
   290         in [Antiquote.Text (Token (Position.range pos1 pos2, (Space, Symbol.space)))] end;
   291     val input =
   292       (Source.of_list syms
   293         |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_antiq))
   294           (SOME (false, fn msg => recover msg >> map Antiquote.Text))
   295         |> Source.exhaust
   296         |> tap (List.app (Antiquote.report report_token))
   297         |> tap Antiquote.check_nesting
   298         |> tap (List.app (fn Antiquote.Text tok => (check_content_of tok; warn tok) | _ => ())))
   299       handle ERROR msg =>
   300         cat_error msg ("The error(s) above occurred in ML source" ^ Position.str_of pos);
   301   in input @ termination end;
   302 
   303 end;
   304 
   305 end;
   306