src/Pure/ML/ml_lex.ML
author wenzelm
Thu, 19 Mar 2009 21:04:53 +0100
changeset 30596 495672058d97
parent 30594 6c9c00ea4ae6
child 30600 de241396389c
permissions -rw-r--r--
added scan_antiq;
more robust scan_ml: plain scanning without cut, regular Symbol_Pos.content instead of Symbol_Pos.implode (which contains spurious Symbol.DEL is used with proper positions);
wenzelm@24579
     1
(*  Title:      Pure/ML/ml_lex.ML
wenzelm@24579
     2
    Author:     Makarius
wenzelm@24579
     3
wenzelm@24579
     4
Lexical syntax for SML.
wenzelm@24579
     5
*)
wenzelm@24579
     6
wenzelm@24579
     7
signature ML_LEX =
wenzelm@24579
     8
sig
wenzelm@24579
     9
  datatype token_kind =
wenzelm@24596
    10
    Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
wenzelm@24579
    11
    Space | Comment | Error of string | EOF
wenzelm@24579
    12
  eqtype token
wenzelm@27732
    13
  val stopper: token Scan.stopper
wenzelm@24596
    14
  val is_regular: token -> bool
wenzelm@24596
    15
  val is_improper: token -> bool
wenzelm@24596
    16
  val pos_of: token -> string
wenzelm@24579
    17
  val kind_of: token -> token_kind
wenzelm@27817
    18
  val content_of: token -> string
wenzelm@24579
    19
  val keywords: string list
wenzelm@30596
    20
  val scan_antiq: Symbol_Pos.T list -> token Antiquote.antiquote * Symbol_Pos.T list
wenzelm@24596
    21
  val source: (Symbol.symbol, 'a) Source.source ->
wenzelm@30576
    22
    (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
wenzelm@27772
    23
      Source.source) Source.source
wenzelm@30594
    24
  val tokenize: string -> token list
wenzelm@24579
    25
end;
wenzelm@24579
    26
wenzelm@24579
    27
structure ML_Lex: ML_LEX =
wenzelm@24579
    28
struct
wenzelm@24579
    29
wenzelm@24579
    30
(** tokens **)
wenzelm@24579
    31
wenzelm@24579
    32
(* datatype token *)
wenzelm@24579
    33
wenzelm@24579
    34
datatype token_kind =
wenzelm@24596
    35
  Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
wenzelm@24579
    36
  Space | Comment | Error of string | EOF;
wenzelm@24579
    37
wenzelm@27772
    38
datatype token = Token of Position.range * (token_kind * string);
wenzelm@27772
    39
wenzelm@27772
    40
wenzelm@27772
    41
(* position *)
wenzelm@27772
    42
wenzelm@27772
    43
fun position_of (Token ((pos, _), _)) = pos;
wenzelm@27772
    44
fun end_position_of (Token ((_, pos), _)) = pos;
wenzelm@27772
    45
wenzelm@27772
    46
val pos_of = Position.str_of o position_of;
wenzelm@24579
    47
wenzelm@24579
    48
wenzelm@24596
    49
(* control tokens *)
wenzelm@24579
    50
wenzelm@27772
    51
fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
wenzelm@27772
    52
val eof = mk_eof Position.none;
wenzelm@24579
    53
wenzelm@24579
    54
fun is_eof (Token (_, (EOF, _))) = true
wenzelm@24579
    55
  | is_eof _ = false;
wenzelm@24579
    56
wenzelm@27772
    57
val stopper =
wenzelm@27772
    58
  Scan.stopper (fn [] => eof | toks => mk_eof (end_position_of (List.last toks))) is_eof;
wenzelm@24579
    59
wenzelm@24579
    60
wenzelm@27772
    61
(* token content *)
wenzelm@27772
    62
wenzelm@27817
    63
fun content_of (Token (_, (_, x))) = x;
wenzelm@27817
    64
fun token_leq (tok, tok') = content_of tok <= content_of tok';
wenzelm@27772
    65
wenzelm@27772
    66
fun kind_of (Token (_, (k, _))) = k;
wenzelm@27772
    67
wenzelm@24596
    68
fun is_regular (Token (_, (Error _, _))) = false
wenzelm@24596
    69
  | is_regular (Token (_, (EOF, _))) = false
wenzelm@24596
    70
  | is_regular _ = true;
wenzelm@24596
    71
wenzelm@24596
    72
fun is_improper (Token (_, (Space, _))) = true
wenzelm@24596
    73
  | is_improper (Token (_, (Comment, _))) = true
wenzelm@24596
    74
  | is_improper _ = false;
wenzelm@24596
    75
wenzelm@24596
    76
wenzelm@24579
    77
wenzelm@24579
    78
(** scanners **)
wenzelm@24579
    79
wenzelm@30576
    80
open Basic_Symbol_Pos;
wenzelm@24579
    81
wenzelm@30576
    82
fun !!! msg = Symbol_Pos.!!! ("SML lexical error: " ^ msg);
wenzelm@24579
    83
wenzelm@24579
    84
wenzelm@27772
    85
(* blanks *)
wenzelm@24579
    86
wenzelm@27772
    87
val scan_blank = Scan.one (Symbol.is_ascii_blank o symbol);
wenzelm@27772
    88
val scan_blanks1 = Scan.repeat1 scan_blank;
wenzelm@24579
    89
wenzelm@24579
    90
wenzelm@24579
    91
(* keywords *)
wenzelm@24579
    92
wenzelm@24579
    93
val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
wenzelm@24579
    94
  "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
wenzelm@24579
    95
  "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
wenzelm@24579
    96
  "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
wenzelm@24579
    97
  "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
wenzelm@24579
    98
  "sharing", "sig", "signature", "struct", "structure", "then", "type",
wenzelm@24579
    99
  "val", "where", "while", "with", "withtype"];
wenzelm@24579
   100
wenzelm@27772
   101
val lex = Scan.make_lexicon (map explode keywords);
wenzelm@27772
   102
fun scan_keyword x = Scan.literal lex x;
wenzelm@24579
   103
wenzelm@24579
   104
wenzelm@24579
   105
(* identifiers *)
wenzelm@24579
   106
wenzelm@24596
   107
local
wenzelm@24596
   108
wenzelm@24579
   109
val scan_letdigs =
wenzelm@27772
   110
  Scan.many ((Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) o symbol);
wenzelm@24579
   111
wenzelm@27772
   112
val scan_alphanumeric = Scan.one (Symbol.is_ascii_letter o symbol) -- scan_letdigs >> op ::;
wenzelm@24579
   113
wenzelm@27772
   114
val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~") o symbol);
wenzelm@24579
   115
wenzelm@24596
   116
in
wenzelm@24596
   117
wenzelm@24579
   118
val scan_ident = scan_alphanumeric || scan_symbolic;
wenzelm@24579
   119
wenzelm@24579
   120
val scan_longident =
wenzelm@27772
   121
  (Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "=");
wenzelm@24579
   122
wenzelm@27772
   123
val scan_typevar = $$$ "'" @@@ scan_letdigs;
wenzelm@24579
   124
wenzelm@24596
   125
end;
wenzelm@24579
   126
wenzelm@24579
   127
wenzelm@24579
   128
(* numerals *)
wenzelm@24579
   129
wenzelm@24596
   130
local
wenzelm@24596
   131
wenzelm@27772
   132
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o symbol);
wenzelm@27772
   133
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o symbol);
wenzelm@27772
   134
val scan_sign = Scan.optional ($$$ "~") [];
wenzelm@27772
   135
val scan_decint = scan_sign @@@ scan_dec;
wenzelm@24579
   136
wenzelm@24596
   137
in
wenzelm@24596
   138
wenzelm@27772
   139
val scan_word =
wenzelm@27772
   140
  $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
wenzelm@27772
   141
  $$$ "0" @@@ $$$ "w" @@@ scan_dec;
wenzelm@24579
   142
wenzelm@27772
   143
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
wenzelm@24579
   144
wenzelm@27772
   145
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
wenzelm@24579
   146
wenzelm@24579
   147
val scan_real =
wenzelm@27772
   148
  scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
wenzelm@27772
   149
  scan_decint @@@ scan_exp;
wenzelm@24579
   150
wenzelm@24596
   151
end;
wenzelm@24596
   152
wenzelm@24579
   153
wenzelm@24579
   154
(* chars and strings *)
wenzelm@24579
   155
wenzelm@24596
   156
local
wenzelm@24596
   157
wenzelm@24596
   158
val scan_escape =
wenzelm@27772
   159
  Scan.one (member (op =) (explode "\"\\abtnvfr") o symbol) >> single ||
wenzelm@27772
   160
  $$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) ||
wenzelm@27772
   161
  Scan.one (Symbol.is_ascii_digit o symbol) --
wenzelm@27772
   162
    Scan.one (Symbol.is_ascii_digit o symbol) --
wenzelm@27772
   163
    Scan.one (Symbol.is_ascii_digit o symbol) >> (fn ((a, b), c) => [a, b, c]);
wenzelm@24596
   164
wenzelm@24596
   165
val scan_str =
wenzelm@30594
   166
  Scan.one (fn (s, _) => Symbol.is_regular s andalso Symbol.is_printable s
wenzelm@30594
   167
    andalso s <> "\"" andalso s <> "\\") >> single ||
wenzelm@27772
   168
  $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
wenzelm@24596
   169
wenzelm@27772
   170
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
wenzelm@27772
   171
val scan_gaps = Scan.repeat scan_gap >> flat;
wenzelm@24579
   172
wenzelm@24596
   173
in
wenzelm@24579
   174
wenzelm@24579
   175
val scan_char =
wenzelm@27772
   176
  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
wenzelm@24579
   177
wenzelm@24579
   178
val scan_string =
wenzelm@27772
   179
  $$$ "\"" @@@ !!! "missing quote at end of string"
wenzelm@27772
   180
    ((Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\"");
wenzelm@24596
   181
wenzelm@24596
   182
end;
wenzelm@24579
   183
wenzelm@24579
   184
wenzelm@24596
   185
(* token source *)
wenzelm@24579
   186
wenzelm@24579
   187
local
wenzelm@24579
   188
wenzelm@30596
   189
fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));
wenzelm@24579
   190
wenzelm@30596
   191
val scan_ml =
wenzelm@27772
   192
 (scan_char >> token Char ||
wenzelm@27772
   193
  scan_string >> token String ||
wenzelm@27772
   194
  scan_blanks1 >> token Space ||
wenzelm@30576
   195
  Symbol_Pos.scan_comment !!! >> token Comment ||
wenzelm@27772
   196
  Scan.max token_leq
wenzelm@27772
   197
   (scan_keyword >> token Keyword)
wenzelm@27772
   198
   (scan_word >> token Word ||
wenzelm@27772
   199
    scan_real >> token Real ||
wenzelm@27772
   200
    scan_int >> token Int ||
wenzelm@27772
   201
    scan_longident >> token LongIdent ||
wenzelm@27772
   202
    scan_ident >> token Ident ||
wenzelm@27772
   203
    scan_typevar >> token TypeVar));
wenzelm@27772
   204
wenzelm@27772
   205
fun recover msg =
wenzelm@27772
   206
  Scan.many (((not o Symbol.is_blank) andf Symbol.is_regular) o symbol)
wenzelm@27772
   207
  >> (fn cs => [token (Error msg) cs]);
wenzelm@24579
   208
wenzelm@24579
   209
in
wenzelm@24579
   210
wenzelm@30596
   211
val scan_antiq = Antiquote.scan || scan_ml >> Antiquote.Text;
wenzelm@30596
   212
wenzelm@24596
   213
fun source src =
wenzelm@30576
   214
  Symbol_Pos.source (Position.line 1) src
wenzelm@30596
   215
  |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover));
wenzelm@30594
   216
wenzelm@30594
   217
val tokenize = Source.of_string #> source #> Source.exhaust;
wenzelm@24579
   218
wenzelm@24579
   219
end;
wenzelm@24579
   220
wenzelm@24596
   221
end;
wenzelm@24579
   222