src/Pure/ML/ml_lex.ML
author wenzelm
Sat, 23 Jul 2011 16:37:17 +0200
changeset 44818 9b00f09f7721
parent 41750 967cbbc77abd
child 45583 fe319b45315c
permissions -rw-r--r--
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
wenzelm@24579
     1
(*  Title:      Pure/ML/ml_lex.ML
wenzelm@24579
     2
    Author:     Makarius
wenzelm@24579
     3
wenzelm@24579
     4
Lexical syntax for SML.
wenzelm@24579
     5
*)
wenzelm@24579
     6
wenzelm@24579
     7
signature ML_LEX =
wenzelm@24579
     8
sig
wenzelm@24579
     9
  datatype token_kind =
wenzelm@24596
    10
    Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
wenzelm@24579
    11
    Space | Comment | Error of string | EOF
wenzelm@24579
    12
  eqtype token
wenzelm@27732
    13
  val stopper: token Scan.stopper
wenzelm@24596
    14
  val is_regular: token -> bool
wenzelm@24596
    15
  val is_improper: token -> bool
wenzelm@30698
    16
  val set_range: Position.range -> token -> token
wenzelm@30640
    17
  val pos_of: token -> Position.T
wenzelm@31474
    18
  val end_pos_of: token -> Position.T
wenzelm@24579
    19
  val kind_of: token -> token_kind
wenzelm@27817
    20
  val content_of: token -> string
wenzelm@31430
    21
  val check_content_of: token -> string
wenzelm@31366
    22
  val flatten: token list -> string
wenzelm@30614
    23
  val report_token: token -> unit
wenzelm@24579
    24
  val keywords: string list
wenzelm@24596
    25
  val source: (Symbol.symbol, 'a) Source.source ->
wenzelm@30576
    26
    (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
wenzelm@27772
    27
      Source.source) Source.source
wenzelm@30594
    28
  val tokenize: string -> token list
wenzelm@37199
    29
  val read: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list
wenzelm@24579
    30
end;
wenzelm@24579
    31
wenzelm@24579
    32
structure ML_Lex: ML_LEX =
wenzelm@24579
    33
struct
wenzelm@24579
    34
wenzelm@24579
    35
(** tokens **)
wenzelm@24579
    36
wenzelm@24579
    37
(* datatype token *)
wenzelm@24579
    38
wenzelm@24579
    39
datatype token_kind =
wenzelm@24596
    40
  Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
wenzelm@24579
    41
  Space | Comment | Error of string | EOF;
wenzelm@24579
    42
wenzelm@27772
    43
datatype token = Token of Position.range * (token_kind * string);
wenzelm@27772
    44
wenzelm@27772
    45
wenzelm@27772
    46
(* position *)
wenzelm@27772
    47
wenzelm@30698
    48
fun set_range range (Token (_, x)) = Token (range, x);
wenzelm@30698
    49
wenzelm@30640
    50
fun pos_of (Token ((pos, _), _)) = pos;
wenzelm@30640
    51
fun end_pos_of (Token ((_, pos), _)) = pos;
wenzelm@24579
    52
wenzelm@24579
    53
wenzelm@24596
    54
(* control tokens *)
wenzelm@24579
    55
wenzelm@27772
    56
fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
wenzelm@27772
    57
val eof = mk_eof Position.none;
wenzelm@24579
    58
wenzelm@24579
    59
fun is_eof (Token (_, (EOF, _))) = true
wenzelm@24579
    60
  | is_eof _ = false;
wenzelm@24579
    61
wenzelm@27772
    62
val stopper =
wenzelm@30640
    63
  Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;
wenzelm@24579
    64
wenzelm@24579
    65
wenzelm@27772
    66
(* token content *)
wenzelm@27772
    67
wenzelm@31430
    68
fun kind_of (Token (_, (k, _))) = k;
wenzelm@31430
    69
wenzelm@27817
    70
fun content_of (Token (_, (_, x))) = x;
wenzelm@27817
    71
fun token_leq (tok, tok') = content_of tok <= content_of tok';
wenzelm@27772
    72
wenzelm@41750
    73
fun is_regular (Token (_, (Error _, _))) = false
wenzelm@41750
    74
  | is_regular (Token (_, (EOF, _))) = false
wenzelm@41750
    75
  | is_regular _ = true;
wenzelm@41750
    76
wenzelm@41750
    77
fun is_improper (Token (_, (Space, _))) = true
wenzelm@41750
    78
  | is_improper (Token (_, (Comment, _))) = true
wenzelm@41750
    79
  | is_improper _ = false;
wenzelm@41750
    80
wenzelm@40599
    81
fun warn tok =
wenzelm@40599
    82
  (case tok of
wenzelm@40599
    83
    Token (_, (Keyword, ":>")) =>
wenzelm@40599
    84
      warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
wenzelm@40599
    85
        \prefer non-opaque matching (:) possibly with abstype" ^
wenzelm@40599
    86
        Position.str_of (pos_of tok))
wenzelm@40599
    87
  | _ => ());
wenzelm@40599
    88
wenzelm@31430
    89
fun check_content_of tok =
wenzelm@30640
    90
  (case kind_of tok of
wenzelm@30649
    91
    Error msg => error msg
wenzelm@40599
    92
  | _ => content_of tok);
wenzelm@30640
    93
wenzelm@41750
    94
fun flatten_content (tok :: (toks as tok' :: _)) =
wenzelm@41750
    95
      Symbol.escape (check_content_of tok) ::
wenzelm@41750
    96
        (if is_improper tok orelse is_improper tok' then flatten_content toks
wenzelm@41750
    97
         else Symbol.space :: flatten_content toks)
wenzelm@41750
    98
  | flatten_content toks = map (Symbol.escape o check_content_of) toks;
wenzelm@31366
    99
wenzelm@41750
   100
val flatten = implode o flatten_content;
wenzelm@24596
   101
wenzelm@24596
   102
wenzelm@30614
   103
(* markup *)
wenzelm@30614
   104
wenzelm@37196
   105
local
wenzelm@37196
   106
wenzelm@30649
   107
val token_kind_markup =
wenzelm@30649
   108
 fn Keyword   => Markup.ML_keyword
wenzelm@30649
   109
  | Ident     => Markup.ML_ident
wenzelm@30649
   110
  | LongIdent => Markup.ML_ident
wenzelm@30649
   111
  | TypeVar   => Markup.ML_tvar
wenzelm@30649
   112
  | Word      => Markup.ML_numeral
wenzelm@30649
   113
  | Int       => Markup.ML_numeral
wenzelm@30649
   114
  | Real      => Markup.ML_numeral
wenzelm@30649
   115
  | Char      => Markup.ML_char
wenzelm@30649
   116
  | String    => Markup.ML_string
wenzelm@38792
   117
  | Space     => Markup.empty
wenzelm@30649
   118
  | Comment   => Markup.ML_comment
wenzelm@30649
   119
  | Error _   => Markup.ML_malformed
wenzelm@38792
   120
  | EOF       => Markup.empty;
wenzelm@30614
   121
wenzelm@37196
   122
fun token_markup kind x =
wenzelm@37196
   123
  if kind = Keyword andalso exists_string (not o Symbol.is_ascii_letter) x
wenzelm@37196
   124
  then Markup.ML_delimiter
wenzelm@37196
   125
  else token_kind_markup kind;
wenzelm@37196
   126
wenzelm@37196
   127
in
wenzelm@37196
   128
wenzelm@39776
   129
fun report_token (Token ((pos, _), (kind, x))) = Position.report pos (token_markup kind x);
wenzelm@37196
   130
wenzelm@37196
   131
end;
wenzelm@30614
   132
wenzelm@30614
   133
wenzelm@24579
   134
wenzelm@24579
   135
(** scanners **)
wenzelm@24579
   136
wenzelm@30576
   137
open Basic_Symbol_Pos;
wenzelm@24579
   138
wenzelm@44818
   139
fun !!! msg = Symbol_Pos.!!! (fn () => "SML lexical error: " ^ msg);
wenzelm@24579
   140
wenzelm@24579
   141
wenzelm@27772
   142
(* blanks *)
wenzelm@24579
   143
wenzelm@40771
   144
val scan_blank = Scan.one (Symbol.is_ascii_blank o Symbol_Pos.symbol);
wenzelm@27772
   145
val scan_blanks1 = Scan.repeat1 scan_blank;
wenzelm@24579
   146
wenzelm@24579
   147
wenzelm@24579
   148
(* keywords *)
wenzelm@24579
   149
wenzelm@24579
   150
val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
wenzelm@24579
   151
  "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
wenzelm@24579
   152
  "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
wenzelm@24579
   153
  "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
wenzelm@24579
   154
  "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
wenzelm@24579
   155
  "sharing", "sig", "signature", "struct", "structure", "then", "type",
wenzelm@24579
   156
  "val", "where", "while", "with", "withtype"];
wenzelm@24579
   157
wenzelm@40875
   158
val lex = Scan.make_lexicon (map raw_explode keywords);
wenzelm@27772
   159
fun scan_keyword x = Scan.literal lex x;
wenzelm@24579
   160
wenzelm@24579
   161
wenzelm@24579
   162
(* identifiers *)
wenzelm@24579
   163
wenzelm@24596
   164
local
wenzelm@24596
   165
wenzelm@24579
   166
val scan_letdigs =
wenzelm@40771
   167
  Scan.many
wenzelm@40771
   168
    ((Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) o
wenzelm@40771
   169
      Symbol_Pos.symbol);
wenzelm@24579
   170
wenzelm@40771
   171
val scan_alphanumeric =
wenzelm@40771
   172
  Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) -- scan_letdigs >> op ::;
wenzelm@24579
   173
wenzelm@40771
   174
val scan_symbolic =
wenzelm@40875
   175
  Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol);
wenzelm@24579
   176
wenzelm@24596
   177
in
wenzelm@24596
   178
wenzelm@24579
   179
val scan_ident = scan_alphanumeric || scan_symbolic;
wenzelm@24579
   180
wenzelm@24579
   181
val scan_longident =
wenzelm@27772
   182
  (Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "=");
wenzelm@24579
   183
wenzelm@27772
   184
val scan_typevar = $$$ "'" @@@ scan_letdigs;
wenzelm@24579
   185
wenzelm@24596
   186
end;
wenzelm@24579
   187
wenzelm@24579
   188
wenzelm@24579
   189
(* numerals *)
wenzelm@24579
   190
wenzelm@24596
   191
local
wenzelm@24596
   192
wenzelm@40771
   193
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol);
wenzelm@40771
   194
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol);
wenzelm@27772
   195
val scan_sign = Scan.optional ($$$ "~") [];
wenzelm@27772
   196
val scan_decint = scan_sign @@@ scan_dec;
wenzelm@24579
   197
wenzelm@24596
   198
in
wenzelm@24596
   199
wenzelm@27772
   200
val scan_word =
wenzelm@27772
   201
  $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
wenzelm@27772
   202
  $$$ "0" @@@ $$$ "w" @@@ scan_dec;
wenzelm@24579
   203
wenzelm@27772
   204
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
wenzelm@24579
   205
wenzelm@27772
   206
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
wenzelm@24579
   207
wenzelm@24579
   208
val scan_real =
wenzelm@27772
   209
  scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
wenzelm@27772
   210
  scan_decint @@@ scan_exp;
wenzelm@24579
   211
wenzelm@24596
   212
end;
wenzelm@24596
   213
wenzelm@24579
   214
wenzelm@24579
   215
(* chars and strings *)
wenzelm@24579
   216
wenzelm@24596
   217
local
wenzelm@24596
   218
wenzelm@24596
   219
val scan_escape =
wenzelm@40875
   220
  Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single ||
wenzelm@27772
   221
  $$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) ||
wenzelm@40771
   222
  Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
wenzelm@40771
   223
    Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
wenzelm@40771
   224
    Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]);
wenzelm@24596
   225
wenzelm@24596
   226
val scan_str =
wenzelm@30600
   227
  Scan.one (fn (s, _) => Symbol.is_regular s andalso s <> "\"" andalso s <> "\\" andalso
wenzelm@30600
   228
    (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||
wenzelm@27772
   229
  $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
wenzelm@24596
   230
wenzelm@27772
   231
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
wenzelm@27772
   232
val scan_gaps = Scan.repeat scan_gap >> flat;
wenzelm@24579
   233
wenzelm@24596
   234
in
wenzelm@24579
   235
wenzelm@24579
   236
val scan_char =
wenzelm@27772
   237
  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
wenzelm@24579
   238
wenzelm@24579
   239
val scan_string =
wenzelm@27772
   240
  $$$ "\"" @@@ !!! "missing quote at end of string"
wenzelm@27772
   241
    ((Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\"");
wenzelm@24596
   242
wenzelm@24596
   243
end;
wenzelm@24579
   244
wenzelm@24579
   245
wenzelm@30649
   246
(* scan tokens *)
wenzelm@24579
   247
wenzelm@24579
   248
local
wenzelm@24579
   249
wenzelm@30596
   250
fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));
wenzelm@24579
   251
wenzelm@30596
   252
val scan_ml =
wenzelm@27772
   253
 (scan_char >> token Char ||
wenzelm@27772
   254
  scan_string >> token String ||
wenzelm@27772
   255
  scan_blanks1 >> token Space ||
wenzelm@30576
   256
  Symbol_Pos.scan_comment !!! >> token Comment ||
wenzelm@27772
   257
  Scan.max token_leq
wenzelm@27772
   258
   (scan_keyword >> token Keyword)
wenzelm@27772
   259
   (scan_word >> token Word ||
wenzelm@27772
   260
    scan_real >> token Real ||
wenzelm@27772
   261
    scan_int >> token Int ||
wenzelm@27772
   262
    scan_longident >> token LongIdent ||
wenzelm@27772
   263
    scan_ident >> token Ident ||
wenzelm@27772
   264
    scan_typevar >> token TypeVar));
wenzelm@27772
   265
wenzelm@30649
   266
val scan_antiq = Antiquote.scan || scan_ml >> Antiquote.Text;
wenzelm@30649
   267
wenzelm@27772
   268
fun recover msg =
wenzelm@40771
   269
  Scan.many (((not o Symbol.is_blank) andf Symbol.is_regular) o Symbol_Pos.symbol)
wenzelm@27772
   270
  >> (fn cs => [token (Error msg) cs]);
wenzelm@24579
   271
wenzelm@24579
   272
in
wenzelm@24579
   273
wenzelm@24596
   274
fun source src =
wenzelm@30576
   275
  Symbol_Pos.source (Position.line 1) src
wenzelm@30596
   276
  |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover));
wenzelm@30594
   277
wenzelm@40769
   278
val tokenize = Source.of_string #> Symbol.source #> source #> Source.exhaust;
wenzelm@24579
   279
wenzelm@37199
   280
fun read pos txt =
wenzelm@37199
   281
  let
wenzelm@39776
   282
    val _ = Position.report pos Markup.ML_source;
wenzelm@37199
   283
    val syms = Symbol_Pos.explode (txt, pos);
wenzelm@37214
   284
    val termination =
wenzelm@37214
   285
      if null syms then []
wenzelm@37214
   286
      else
wenzelm@37214
   287
        let
wenzelm@37214
   288
          val pos1 = List.last syms |-> Position.advance;
wenzelm@37214
   289
          val pos2 = Position.advance Symbol.space pos1;
wenzelm@37214
   290
        in [Antiquote.Text (Token (Position.range pos1 pos2, (Space, Symbol.space)))] end;
wenzelm@37214
   291
    val input =
wenzelm@37214
   292
      (Source.of_list syms
wenzelm@37214
   293
        |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_antiq))
wenzelm@37214
   294
          (SOME (false, fn msg => recover msg >> map Antiquote.Text))
wenzelm@37214
   295
        |> Source.exhaust
wenzelm@37214
   296
        |> tap (List.app (Antiquote.report report_token))
wenzelm@37214
   297
        |> tap Antiquote.check_nesting
wenzelm@40599
   298
        |> tap (List.app (fn Antiquote.Text tok => (check_content_of tok; warn tok) | _ => ())))
wenzelm@37214
   299
      handle ERROR msg =>
wenzelm@37214
   300
        cat_error msg ("The error(s) above occurred in ML source" ^ Position.str_of pos);
wenzelm@37214
   301
  in input @ termination end;
wenzelm@30649
   302
wenzelm@24579
   303
end;
wenzelm@24579
   304
wenzelm@24596
   305
end;
wenzelm@24579
   306