(* NOTE(review): extraction artifact — each line below carries its original
   file line number and many intermediate lines are elided; all original
   tokens are left untouched. *)
1 (* Title: Pure/ML/ml_lex.ML
4 Lexical syntax for SML.
(* token_kind constructors (fragment of the datatype specification). *)
10 Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
11 Space | Comment | Error of string | EOF
(* Scanner stopper, token predicates, and accessors. *)
13 val stopper: token Scan.stopper
14 val is_regular: token -> bool
15 val is_improper: token -> bool
16 val set_range: Position.range -> token -> token
17 val pos_of: token -> Position.T
18 val end_pos_of: token -> Position.T
19 val kind_of: token -> token_kind
20 val content_of: token -> string
21 val check_content_of: token -> string
22 val flatten: token list -> string
23 val report_token: token -> unit
24 val keywords: string list
(* source: lex a symbol source into a token source, with error recovery. *)
25 val source: (Symbol.symbol, 'a) Source.source ->
26 (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
27 Source.source) Source.source
28 val tokenize: string -> token list
29 val read_antiq: Symbol_Pos.T list * Position.T -> token Antiquote.antiquote list
(* Structure ML_Lex implements signature ML_LEX (opening lines elided here). *)
32 structure ML_Lex: ML_LEX =
(* Lexical classes of SML tokens; Error carries the diagnostic message. *)
40 Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
41 Space | Comment | Error of string | EOF;
(* A token: its source position range together with kind and literal content. *)
43 datatype token = Token of Position.range * (token_kind * string);
(* Replace a token's position range, keeping kind and content unchanged. *)
fun set_range range (Token (_, kind_content)) = Token (range, kind_content);

(* Start and end positions of a token's range (Position.range is a pair). *)
fun pos_of (Token (range, _)) = #1 range;
fun end_pos_of (Token (range, _)) = #2 range;
(* Synthetic EOF token at the given position (end position is a dummy). *)
56 fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
57 val eof = mk_eof Position.none;
(* is_eof: fragment — the second (false) clause is elided in this view. *)
59 fun is_eof (Token (_, (EOF, _))) = true
(* Scanner stopper: synthesize EOF at the end position of the last token. *)
63 Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;
(* Projections: token kind and literal content; ordering of tokens by content
   (used for longest/greatest-match selection). *)
fun kind_of (Token (_, (kind, _))) = kind;
fun content_of (Token (_, (_, content))) = content;
fun token_leq (tok1, tok2) = content_of tok1 <= content_of tok2;
(* Content of a token, raising the stored message for Error tokens
   (fragment: the "(case kind_of tok of" line is elided in this view). *)
73 fun check_content_of tok =
75 Error msg => error msg
76 | _ => content_of tok);
(* Flatten tokens into one string, escaping each token's checked content;
   an Error token aborts via check_content_of. *)
fun flatten toks = implode (map (Symbol.escape o check_content_of) toks);
(* A token is regular unless it is an error or end-of-file token. *)
fun is_regular tok =
  (case kind_of tok of
    Error _ => false
  | EOF => false
  | _ => true);
(* Whitespace and comments are "improper" tokens (no semantic content). *)
fun is_improper tok =
  (case kind_of tok of
    Space => true
  | Comment => true
  | _ => false);
(* Markup used when reporting a token kind to the front end. *)
fun token_kind_markup kind =
  (case kind of
    Keyword => Markup.ML_keyword
  | Ident => Markup.ML_ident
  | LongIdent => Markup.ML_ident
  | TypeVar => Markup.ML_tvar
  | Word => Markup.ML_numeral
  | Int => Markup.ML_numeral
  | Real => Markup.ML_numeral
  | Char => Markup.ML_char
  | String => Markup.ML_string
  | Space => Markup.none
  | Comment => Markup.ML_comment
  | Error _ => Markup.ML_malformed
  | EOF => Markup.none);
(* Report a token's markup at its start position. *)
fun report_token tok = Position.report (token_kind_markup (kind_of tok)) (pos_of tok);
(* Bring pervasive symbol-position combinators ($$$, @@@, symbol, ...) into scope. *)
113 open Basic_Symbol_Pos;
(* Hard-failure cut for lexical errors, prefixing a uniform message. *)
115 fun !!! msg = Symbol_Pos.!!! ("SML lexical error: " ^ msg);
(* One ASCII blank symbol, and a non-empty run of blanks. *)
val scan_blank = Scan.one (fn s => Symbol.is_ascii_blank (symbol s));
val scan_blanks1 = Scan.repeat1 scan_blank;
(* Reserved words and symbolic keywords of Standard ML (SML '97). *)
126 val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
127 "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
128 "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
129 "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
130 "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
131 "sharing", "sig", "signature", "struct", "structure", "then", "type",
132 "val", "where", "while", "with", "withtype"];
(* Longest-match lexicon built from the keyword list. *)
134 val lex = Scan.make_lexicon (map explode keywords);
135 fun scan_keyword x = Scan.literal lex x;
(* scan_letdigs body: letters, digits, or quasi characters
   (fragment: the "val scan_letdigs =" binding line is elided in this view). *)
143 Scan.many ((Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) o symbol);
(* Alphanumeric identifier: leading letter followed by letdigs. *)
145 val scan_alphanumeric = Scan.one (Symbol.is_ascii_letter o symbol) -- scan_letdigs >> op ::;
(* Symbolic identifier: non-empty run of SML symbolic characters. *)
147 val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~") o symbol);
151 val scan_ident = scan_alphanumeric || scan_symbolic;
(* scan_longident body (binding line elided): qualified "A.B.c" or "A.=". *)
154 (Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "=");
(* Type variable: apostrophe followed by letdigs. *)
156 val scan_typevar = $$$ "'" @@@ scan_letdigs;
(* Decimal and hexadecimal digit runs; optional "~" negation sign. *)
165 val scan_dec = Scan.many1 (Symbol.is_ascii_digit o symbol);
166 val scan_hex = Scan.many1 (Symbol.is_ascii_hex o symbol);
167 val scan_sign = Scan.optional ($$$ "~") [];
168 val scan_decint = scan_sign @@@ scan_dec;
(* scan_word body (binding line elided): word literals 0wxH... or 0wD... *)
173 $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
174 $$$ "0" @@@ $$$ "w" @@@ scan_dec;
(* Integer: optional sign, then hex (0x...) or plain decimal digits. *)
176 val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
(* Exponent of a real literal: E or e, then a signed decimal. *)
178 val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
(* scan_real body (binding line elided): d.d[exp] or d exp forms. *)
181 scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
182 scan_decint @@@ scan_exp;
187 (* chars and strings *)
(* scan_escape body (binding line elided): a named escape character,
   a control escape ^C with C in the '@'..'_' range, or a three-digit
   decimal escape. *)
192 Scan.one (member (op =) (explode "\"\\abtnvfr") o symbol) >> single ||
193 $$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) ||
194 Scan.one (Symbol.is_ascii_digit o symbol) --
195 Scan.one (Symbol.is_ascii_digit o symbol) --
196 Scan.one (Symbol.is_ascii_digit o symbol) >> (fn ((a, b), c) => [a, b, c]);
(* scan_str body (binding line elided): one printable string element that is
   not a quote or backslash, or a backslash escape; a bad escape is a hard
   error via !!!. *)
199 Scan.one (fn (s, _) => Symbol.is_regular s andalso s <> "\"" andalso s <> "\\" andalso
200 (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||
201 $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
(* Line-continuation gap: backslash, blanks, backslash. *)
203 val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
204 val scan_gaps = Scan.repeat scan_gap >> flat;
(* scan_char body (binding line elided): #"..." character literal,
   allowing gaps around the element. *)
209 $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
(* scan_string body (binding line elided): quoted string; a missing closing
   quote is a hard error. *)
212 $$$ "\"" @@@ !!! "missing quote at end of string"
213 ((Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\"");
(* Build a token of the given kind from scanned symbols, recording their range. *)
fun token kind syms = Token (Symbol_Pos.range syms, (kind, Symbol_Pos.content syms));
(* scan_ml (opening lines elided): one SML token — char/string/space/comment
   first, then keyword vs. word/real/int/longident/ident/tyvar; the elided
   line 229 ("Scan.max token_leq") selects the greater match between the
   keyword alternative and the other alternatives. *)
225 (scan_char >> token Char ||
226 scan_string >> token String ||
227 scan_blanks1 >> token Space ||
228 Symbol_Pos.scan_comment !!! >> token Comment ||
230 (scan_keyword >> token Keyword)
231 (scan_word >> token Word ||
232 scan_real >> token Real ||
233 scan_int >> token Int ||
234 scan_longident >> token LongIdent ||
235 scan_ident >> token Ident ||
236 scan_typevar >> token TypeVar));
(* An antiquotation, or a plain ML token wrapped as Antiquote.Text. *)
238 val scan_antiq = Antiquote.scan || scan_ml >> Antiquote.Text;
(* recover body (binding line elided): consume a run of non-blank regular
   symbols and emit them as a single Error token carrying msg. *)
241 Scan.many (((not o Symbol.is_blank) andf Symbol.is_regular) o symbol)
242 >> (fn cs => [token (Error msg) cs]);
(* source body (binding line elided): lex a symbol source starting at line 1,
   with recovery on lexical errors. *)
247 Symbol_Pos.source (Position.line 1) src
248 |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover));
(* Tokenize a plain string eagerly into a list of ML tokens. *)
fun tokenize txt = Source.exhaust (source (Source.of_string txt));
(* Read ML text with antiquotations: lexes with recovery, reports token
   markup, checks antiquotation nesting and token content, and rewraps any
   error with the source position (several lines elided in this view). *)
252 fun read_antiq (syms, pos) =
254 |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_antiq))
255 (SOME (false, fn msg => recover msg >> map Antiquote.Text))
257 |> tap (List.app (Antiquote.report report_token))
258 |> tap Antiquote.check_nesting
259 |> tap (List.app (fn Antiquote.Text tok => ignore (check_content_of tok) | _ => ())))
261 cat_error msg ("The error(s) above occurred in ML source" ^ Position.str_of pos);