wenzelm@24579
|
1 |
(* Title: Pure/ML/ml_lex.ML
|
wenzelm@24579
|
2 |
Author: Makarius
|
wenzelm@24579
|
3 |
|
wenzelm@24579
|
4 |
Lexical syntax for SML.
|
wenzelm@24579
|
5 |
*)
|
wenzelm@24579
|
6 |
|
wenzelm@24579
|
7 |
signature ML_LEX =
|
wenzelm@24579
|
8 |
sig
|
wenzelm@24579
|
9 |
datatype token_kind =
|
wenzelm@24596
|
10 |
Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
|
wenzelm@24579
|
11 |
Space | Comment | Error of string | EOF
|
wenzelm@24579
|
12 |
eqtype token
|
wenzelm@27732
|
13 |
val stopper: token Scan.stopper
|
wenzelm@24596
|
14 |
val is_regular: token -> bool
|
wenzelm@24596
|
15 |
val is_improper: token -> bool
|
wenzelm@30698
|
16 |
val set_range: Position.range -> token -> token
|
wenzelm@30640
|
17 |
val pos_of: token -> Position.T
|
wenzelm@31474
|
18 |
val end_pos_of: token -> Position.T
|
wenzelm@24579
|
19 |
val kind_of: token -> token_kind
|
wenzelm@27817
|
20 |
val content_of: token -> string
|
wenzelm@31430
|
21 |
val check_content_of: token -> string
|
wenzelm@31366
|
22 |
val flatten: token list -> string
|
wenzelm@30614
|
23 |
val report_token: token -> unit
|
wenzelm@24579
|
24 |
val keywords: string list
|
wenzelm@24596
|
25 |
val source: (Symbol.symbol, 'a) Source.source ->
|
wenzelm@30576
|
26 |
(token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
|
wenzelm@27772
|
27 |
Source.source) Source.source
|
wenzelm@30594
|
28 |
val tokenize: string -> token list
|
wenzelm@37199
|
29 |
val read: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list
|
wenzelm@24579
|
30 |
end;
|
wenzelm@24579
|
31 |
|
wenzelm@24579
|
32 |
structure ML_Lex: ML_LEX =
|
wenzelm@24579
|
33 |
struct
|
wenzelm@24579
|
34 |
|
wenzelm@24579
|
35 |
(** tokens **)
|
wenzelm@24579
|
36 |
|
wenzelm@24579
|
37 |
(* datatype token *)
|
wenzelm@24579
|
38 |
|
wenzelm@24579
|
39 |
datatype token_kind =
|
wenzelm@24596
|
40 |
Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
|
wenzelm@24579
|
41 |
Space | Comment | Error of string | EOF;
|
wenzelm@24579
|
42 |
|
wenzelm@27772
|
43 |
datatype token = Token of Position.range * (token_kind * string);
|
wenzelm@27772
|
44 |
|
wenzelm@27772
|
45 |
|
wenzelm@27772
|
46 |
(* position *)
|
wenzelm@27772
|
47 |
|
wenzelm@30698
|
48 |
fun set_range range (Token (_, x)) = Token (range, x);
|
wenzelm@30698
|
49 |
|
wenzelm@30640
|
50 |
fun pos_of (Token ((pos, _), _)) = pos;
|
wenzelm@30640
|
51 |
fun end_pos_of (Token ((_, pos), _)) = pos;
|
wenzelm@24579
|
52 |
|
wenzelm@24579
|
53 |
|
wenzelm@24596
|
54 |
(* control tokens *)
|
wenzelm@24579
|
55 |
|
wenzelm@27772
|
56 |
fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
|
wenzelm@27772
|
57 |
val eof = mk_eof Position.none;
|
wenzelm@24579
|
58 |
|
wenzelm@24579
|
59 |
fun is_eof (Token (_, (EOF, _))) = true
|
wenzelm@24579
|
60 |
| is_eof _ = false;
|
wenzelm@24579
|
61 |
|
wenzelm@27772
|
62 |
val stopper =
|
wenzelm@30640
|
63 |
Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;
|
wenzelm@24579
|
64 |
|
wenzelm@24579
|
65 |
|
wenzelm@27772
|
66 |
(* token content *)
|
wenzelm@27772
|
67 |
|
wenzelm@31430
|
68 |
fun kind_of (Token (_, (k, _))) = k;
|
wenzelm@31430
|
69 |
|
wenzelm@27817
|
70 |
fun content_of (Token (_, (_, x))) = x;
|
wenzelm@27817
|
71 |
fun token_leq (tok, tok') = content_of tok <= content_of tok';
|
wenzelm@27772
|
72 |
|
wenzelm@31430
|
73 |
fun check_content_of tok =
|
wenzelm@30640
|
74 |
(case kind_of tok of
|
wenzelm@30649
|
75 |
Error msg => error msg
|
wenzelm@40576
|
76 |
| _ =>
|
wenzelm@40576
|
77 |
(case tok of
|
wenzelm@40576
|
78 |
Token (_, (Keyword, ":>")) =>
|
wenzelm@40576
|
79 |
warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
|
wenzelm@40576
|
80 |
\prefer non-opaque matching (:) possibly with abstype" ^
|
wenzelm@40576
|
81 |
Position.str_of (pos_of tok))
|
wenzelm@40576
|
82 |
| _ => ();
|
wenzelm@40576
|
83 |
content_of tok));
|
wenzelm@30640
|
84 |
|
wenzelm@31430
|
85 |
val flatten = implode o map (Symbol.escape o check_content_of);
|
wenzelm@31366
|
86 |
|
wenzelm@24596
|
87 |
fun is_regular (Token (_, (Error _, _))) = false
|
wenzelm@24596
|
88 |
| is_regular (Token (_, (EOF, _))) = false
|
wenzelm@24596
|
89 |
| is_regular _ = true;
|
wenzelm@24596
|
90 |
|
wenzelm@24596
|
91 |
fun is_improper (Token (_, (Space, _))) = true
|
wenzelm@24596
|
92 |
| is_improper (Token (_, (Comment, _))) = true
|
wenzelm@24596
|
93 |
| is_improper _ = false;
|
wenzelm@24596
|
94 |
|
wenzelm@24596
|
95 |
|
wenzelm@30614
|
96 |
(* markup *)
|
wenzelm@30614
|
97 |
|
wenzelm@37196
|
98 |
local
|
wenzelm@37196
|
99 |
|
wenzelm@30649
|
100 |
val token_kind_markup =
|
wenzelm@30649
|
101 |
fn Keyword => Markup.ML_keyword
|
wenzelm@30649
|
102 |
| Ident => Markup.ML_ident
|
wenzelm@30649
|
103 |
| LongIdent => Markup.ML_ident
|
wenzelm@30649
|
104 |
| TypeVar => Markup.ML_tvar
|
wenzelm@30649
|
105 |
| Word => Markup.ML_numeral
|
wenzelm@30649
|
106 |
| Int => Markup.ML_numeral
|
wenzelm@30649
|
107 |
| Real => Markup.ML_numeral
|
wenzelm@30649
|
108 |
| Char => Markup.ML_char
|
wenzelm@30649
|
109 |
| String => Markup.ML_string
|
wenzelm@38792
|
110 |
| Space => Markup.empty
|
wenzelm@30649
|
111 |
| Comment => Markup.ML_comment
|
wenzelm@30649
|
112 |
| Error _ => Markup.ML_malformed
|
wenzelm@38792
|
113 |
| EOF => Markup.empty;
|
wenzelm@30614
|
114 |
|
wenzelm@37196
|
115 |
fun token_markup kind x =
|
wenzelm@37196
|
116 |
if kind = Keyword andalso exists_string (not o Symbol.is_ascii_letter) x
|
wenzelm@37196
|
117 |
then Markup.ML_delimiter
|
wenzelm@37196
|
118 |
else token_kind_markup kind;
|
wenzelm@37196
|
119 |
|
wenzelm@37196
|
120 |
in
|
wenzelm@37196
|
121 |
|
wenzelm@39776
|
122 |
fun report_token (Token ((pos, _), (kind, x))) = Position.report pos (token_markup kind x);
|
wenzelm@37196
|
123 |
|
wenzelm@37196
|
124 |
end;
|
wenzelm@30614
|
125 |
|
wenzelm@30614
|
126 |
|
wenzelm@24579
|
127 |
|
wenzelm@24579
|
128 |
(** scanners **)
|
wenzelm@24579
|
129 |
|
wenzelm@30576
|
130 |
open Basic_Symbol_Pos;
|
wenzelm@24579
|
131 |
|
wenzelm@30576
|
132 |
fun !!! msg = Symbol_Pos.!!! ("SML lexical error: " ^ msg);
|
wenzelm@24579
|
133 |
|
wenzelm@24579
|
134 |
|
wenzelm@27772
|
135 |
(* blanks *)
|
wenzelm@24579
|
136 |
|
wenzelm@27772
|
137 |
val scan_blank = Scan.one (Symbol.is_ascii_blank o symbol);
|
wenzelm@27772
|
138 |
val scan_blanks1 = Scan.repeat1 scan_blank;
|
wenzelm@24579
|
139 |
|
wenzelm@24579
|
140 |
|
wenzelm@24579
|
141 |
(* keywords *)
|
wenzelm@24579
|
142 |
|
wenzelm@24579
|
143 |
val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
|
wenzelm@24579
|
144 |
"=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
|
wenzelm@24579
|
145 |
"case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
|
wenzelm@24579
|
146 |
"fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
|
wenzelm@24579
|
147 |
"let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
|
wenzelm@24579
|
148 |
"sharing", "sig", "signature", "struct", "structure", "then", "type",
|
wenzelm@24579
|
149 |
"val", "where", "while", "with", "withtype"];
|
wenzelm@24579
|
150 |
|
wenzelm@27772
|
151 |
val lex = Scan.make_lexicon (map explode keywords);
|
wenzelm@27772
|
152 |
fun scan_keyword x = Scan.literal lex x;
|
wenzelm@24579
|
153 |
|
wenzelm@24579
|
154 |
|
wenzelm@24579
|
155 |
(* identifiers *)
|
wenzelm@24579
|
156 |
|
wenzelm@24596
|
157 |
local
|
wenzelm@24596
|
158 |
|
wenzelm@24579
|
159 |
val scan_letdigs =
|
wenzelm@27772
|
160 |
Scan.many ((Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) o symbol);
|
wenzelm@24579
|
161 |
|
wenzelm@27772
|
162 |
val scan_alphanumeric = Scan.one (Symbol.is_ascii_letter o symbol) -- scan_letdigs >> op ::;
|
wenzelm@24579
|
163 |
|
wenzelm@27772
|
164 |
val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~") o symbol);
|
wenzelm@24579
|
165 |
|
wenzelm@24596
|
166 |
in
|
wenzelm@24596
|
167 |
|
wenzelm@24579
|
168 |
val scan_ident = scan_alphanumeric || scan_symbolic;
|
wenzelm@24579
|
169 |
|
wenzelm@24579
|
170 |
val scan_longident =
|
wenzelm@27772
|
171 |
(Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "=");
|
wenzelm@24579
|
172 |
|
wenzelm@27772
|
173 |
val scan_typevar = $$$ "'" @@@ scan_letdigs;
|
wenzelm@24579
|
174 |
|
wenzelm@24596
|
175 |
end;
|
wenzelm@24579
|
176 |
|
wenzelm@24579
|
177 |
|
wenzelm@24579
|
178 |
(* numerals *)
|
wenzelm@24579
|
179 |
|
wenzelm@24596
|
180 |
local
|
wenzelm@24596
|
181 |
|
wenzelm@27772
|
182 |
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o symbol);
|
wenzelm@27772
|
183 |
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o symbol);
|
wenzelm@27772
|
184 |
val scan_sign = Scan.optional ($$$ "~") [];
|
wenzelm@27772
|
185 |
val scan_decint = scan_sign @@@ scan_dec;
|
wenzelm@24579
|
186 |
|
wenzelm@24596
|
187 |
in
|
wenzelm@24596
|
188 |
|
wenzelm@27772
|
189 |
val scan_word =
|
wenzelm@27772
|
190 |
$$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
|
wenzelm@27772
|
191 |
$$$ "0" @@@ $$$ "w" @@@ scan_dec;
|
wenzelm@24579
|
192 |
|
wenzelm@27772
|
193 |
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
|
wenzelm@24579
|
194 |
|
wenzelm@27772
|
195 |
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
|
wenzelm@24579
|
196 |
|
wenzelm@24579
|
197 |
val scan_real =
|
wenzelm@27772
|
198 |
scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
|
wenzelm@27772
|
199 |
scan_decint @@@ scan_exp;
|
wenzelm@24579
|
200 |
|
wenzelm@24596
|
201 |
end;
|
wenzelm@24596
|
202 |
|
wenzelm@24579
|
203 |
|
wenzelm@24579
|
204 |
(* chars and strings *)
|
wenzelm@24579
|
205 |
|
wenzelm@24596
|
206 |
local
|
wenzelm@24596
|
207 |
|
wenzelm@24596
|
208 |
val scan_escape =
|
wenzelm@27772
|
209 |
Scan.one (member (op =) (explode "\"\\abtnvfr") o symbol) >> single ||
|
wenzelm@27772
|
210 |
$$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) ||
|
wenzelm@27772
|
211 |
Scan.one (Symbol.is_ascii_digit o symbol) --
|
wenzelm@27772
|
212 |
Scan.one (Symbol.is_ascii_digit o symbol) --
|
wenzelm@27772
|
213 |
Scan.one (Symbol.is_ascii_digit o symbol) >> (fn ((a, b), c) => [a, b, c]);
|
wenzelm@24596
|
214 |
|
wenzelm@24596
|
215 |
val scan_str =
|
wenzelm@30600
|
216 |
Scan.one (fn (s, _) => Symbol.is_regular s andalso s <> "\"" andalso s <> "\\" andalso
|
wenzelm@30600
|
217 |
(not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||
|
wenzelm@27772
|
218 |
$$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
|
wenzelm@24596
|
219 |
|
wenzelm@27772
|
220 |
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
|
wenzelm@27772
|
221 |
val scan_gaps = Scan.repeat scan_gap >> flat;
|
wenzelm@24579
|
222 |
|
wenzelm@24596
|
223 |
in
|
wenzelm@24579
|
224 |
|
wenzelm@24579
|
225 |
val scan_char =
|
wenzelm@27772
|
226 |
$$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
|
wenzelm@24579
|
227 |
|
wenzelm@24579
|
228 |
val scan_string =
|
wenzelm@27772
|
229 |
$$$ "\"" @@@ !!! "missing quote at end of string"
|
wenzelm@27772
|
230 |
((Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\"");
|
wenzelm@24596
|
231 |
|
wenzelm@24596
|
232 |
end;
|
wenzelm@24579
|
233 |
|
wenzelm@24579
|
234 |
|
wenzelm@30649
|
235 |
(* scan tokens *)
|
wenzelm@24579
|
236 |
|
wenzelm@24579
|
237 |
local
|
wenzelm@24579
|
238 |
|
wenzelm@30596
|
239 |
fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));
|
wenzelm@24579
|
240 |
|
wenzelm@30596
|
241 |
val scan_ml =
|
wenzelm@27772
|
242 |
(scan_char >> token Char ||
|
wenzelm@27772
|
243 |
scan_string >> token String ||
|
wenzelm@27772
|
244 |
scan_blanks1 >> token Space ||
|
wenzelm@30576
|
245 |
Symbol_Pos.scan_comment !!! >> token Comment ||
|
wenzelm@27772
|
246 |
Scan.max token_leq
|
wenzelm@27772
|
247 |
(scan_keyword >> token Keyword)
|
wenzelm@27772
|
248 |
(scan_word >> token Word ||
|
wenzelm@27772
|
249 |
scan_real >> token Real ||
|
wenzelm@27772
|
250 |
scan_int >> token Int ||
|
wenzelm@27772
|
251 |
scan_longident >> token LongIdent ||
|
wenzelm@27772
|
252 |
scan_ident >> token Ident ||
|
wenzelm@27772
|
253 |
scan_typevar >> token TypeVar));
|
wenzelm@27772
|
254 |
|
wenzelm@30649
|
255 |
val scan_antiq = Antiquote.scan || scan_ml >> Antiquote.Text;
|
wenzelm@30649
|
256 |
|
wenzelm@27772
|
257 |
fun recover msg =
|
wenzelm@27772
|
258 |
Scan.many (((not o Symbol.is_blank) andf Symbol.is_regular) o symbol)
|
wenzelm@27772
|
259 |
>> (fn cs => [token (Error msg) cs]);
|
wenzelm@24579
|
260 |
|
wenzelm@24579
|
261 |
in
|
wenzelm@24579
|
262 |
|
wenzelm@24596
|
263 |
fun source src =
|
wenzelm@30576
|
264 |
Symbol_Pos.source (Position.line 1) src
|
wenzelm@30596
|
265 |
|> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover));
|
wenzelm@30594
|
266 |
|
wenzelm@31546
|
267 |
val tokenize =
|
wenzelm@31546
|
268 |
Source.of_string #>
|
wenzelm@31546
|
269 |
Symbol.source {do_recover = true} #>
|
wenzelm@31546
|
270 |
source #>
|
wenzelm@31546
|
271 |
Source.exhaust;
|
wenzelm@24579
|
272 |
|
wenzelm@37199
|
273 |
fun read pos txt =
|
wenzelm@37199
|
274 |
let
|
wenzelm@39776
|
275 |
val _ = Position.report pos Markup.ML_source;
|
wenzelm@37199
|
276 |
val syms = Symbol_Pos.explode (txt, pos);
|
wenzelm@37214
|
277 |
val termination =
|
wenzelm@37214
|
278 |
if null syms then []
|
wenzelm@37214
|
279 |
else
|
wenzelm@37214
|
280 |
let
|
wenzelm@37214
|
281 |
val pos1 = List.last syms |-> Position.advance;
|
wenzelm@37214
|
282 |
val pos2 = Position.advance Symbol.space pos1;
|
wenzelm@37214
|
283 |
in [Antiquote.Text (Token (Position.range pos1 pos2, (Space, Symbol.space)))] end;
|
wenzelm@37214
|
284 |
val input =
|
wenzelm@37214
|
285 |
(Source.of_list syms
|
wenzelm@37214
|
286 |
|> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_antiq))
|
wenzelm@37214
|
287 |
(SOME (false, fn msg => recover msg >> map Antiquote.Text))
|
wenzelm@37214
|
288 |
|> Source.exhaust
|
wenzelm@37214
|
289 |
|> tap (List.app (Antiquote.report report_token))
|
wenzelm@37214
|
290 |
|> tap Antiquote.check_nesting
|
wenzelm@37214
|
291 |
|> tap (List.app (fn Antiquote.Text tok => ignore (check_content_of tok) | _ => ())))
|
wenzelm@37214
|
292 |
handle ERROR msg =>
|
wenzelm@37214
|
293 |
cat_error msg ("The error(s) above occurred in ML source" ^ Position.str_of pos);
|
wenzelm@37214
|
294 |
in input @ termination end;
|
wenzelm@30649
|
295 |
|
wenzelm@24579
|
296 |
end;
|
wenzelm@24579
|
297 |
|
wenzelm@24596
|
298 |
end;
|
wenzelm@24579
|
299 |
|