1.1 --- a/src/Pure/ML/ml_lex.ML Sun Sep 16 14:52:29 2007 +0200
1.2 +++ b/src/Pure/ML/ml_lex.ML Sun Sep 16 14:52:31 2007 +0200
1.3 @@ -8,23 +8,18 @@
1.4 signature ML_LEX =
1.5 sig
1.6 datatype token_kind =
1.7 - Keyword | Ident | LongIdent | TypeVar | Selector | Word | Int | Real | Char | String |
1.8 + Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
1.9 Space | Comment | Error of string | EOF
1.10 eqtype token
1.11 - val str_of_kind: token_kind -> string
1.12 val stopper: token * (token -> bool)
1.13 - val not_eof: token -> bool
1.14 + val is_regular: token -> bool
1.15 + val is_improper: token -> bool
1.16 + val pos_of: token -> string
1.17 val kind_of: token -> token_kind
1.18 - val is_kind: token_kind -> token -> bool
1.19 - val is_proper: token -> bool
1.20 val val_of: token -> string
1.21 - val !!! : string -> (int * 'a -> 'b) -> int * 'a -> 'b
1.22 val keywords: string list
1.23 - val scan: int * string list -> token * (int * string list)
1.24 - val source: bool option -> (string, 'a) Source.source ->
1.25 - (token, int * (string, 'a) Source.source) Source.source
1.26 - val source_proper: (token, 'a) Source.source ->
1.27 - (token, (token, 'a) Source.source) Source.source
1.28 + val source: (Symbol.symbol, 'a) Source.source ->
1.29 + (token, int * (Symbol.symbol, 'a) Source.source) Source.source
1.30 end;
1.31
1.32 structure ML_Lex: ML_LEX =
1.33 @@ -35,29 +30,13 @@
1.34 (* datatype token *)
1.35
1.36 datatype token_kind =
1.37 - Keyword | Ident | LongIdent | TypeVar | Selector | Word | Int | Real | Char | String |
1.38 + Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
1.39 Space | Comment | Error of string | EOF;
1.40
1.41 datatype token = Token of int * (token_kind * string);
1.42
1.43 -val str_of_kind =
1.44 - fn Keyword => "keyword"
1.45 - | Ident => "identifier"
1.46 - | LongIdent => "long identifier"
1.47 - | TypeVar => "type variable"
1.48 - | Selector => "record selector"
1.49 - | Word => "word"
1.50 - | Int => "integer"
1.51 - | Real => "real"
1.52 - | Char => "character"
1.53 - | String => "string"
1.54 - | Space => "white space"
1.55 - | Comment => "comment"
1.56 - | Error _ => "bad input"
1.57 - | EOF => "end-of-file";
1.58
1.59 -
1.60 -(* end-of-file *)
1.61 +(* control tokens *)
1.62
1.63 val eof = Token (0, (EOF, ""));
1.64
1.65 @@ -65,22 +44,22 @@
1.66 | is_eof _ = false;
1.67
1.68 val stopper = (eof, is_eof);
1.69 -val not_eof = not o is_eof;
1.70
1.71
1.72 -(* token kind *)
1.73 +fun is_regular (Token (_, (Error _, _))) = false
1.74 + | is_regular (Token (_, (EOF, _))) = false
1.75 + | is_regular _ = true;
1.76 +
1.77 +fun is_improper (Token (_, (Space, _))) = true
1.78 + | is_improper (Token (_, (Comment, _))) = true
1.79 + | is_improper _ = false;
1.80 +
1.81 +
1.82 +(* token content *)
1.83 +
1.84 +fun pos_of (Token (n, _)) = " (line " ^ string_of_int n ^ ")";
1.85
1.86 fun kind_of (Token (_, (k, _))) = k;
1.87 -
1.88 -fun is_kind k (Token (_, (k', _))) = k = k';
1.89 -
1.90 -fun is_proper (Token (_, (Space, _))) = false
1.91 - | is_proper (Token (_, (Comment, _))) = false
1.92 - | is_proper _ = true;
1.93 -
1.94 -
1.95 -(* token value *)
1.96 -
1.97 fun val_of (Token (_, (_, x))) = x;
1.98
1.99 fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
1.100 @@ -97,7 +76,7 @@
1.101
1.102 (* line numbering *)
1.103
1.104 -fun incr_line scan = Scan.depend (fn n => scan >> pair (n + 1));
1.105 +fun incr_line scan = Scan.depend (fn (n: int) => scan >> pair (n + 1));
1.106 val keep_line = Scan.lift;
1.107
1.108 val scan_blank =
1.109 @@ -123,6 +102,8 @@
1.110
1.111 (* identifiers *)
1.112
1.113 +local
1.114 +
1.115 val scan_letdigs =
1.116 Scan.many (Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) >> implode;
1.117
1.118 @@ -130,6 +111,8 @@
1.119
1.120 val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~")) >> implode;
1.121
1.122 +in
1.123 +
1.124 val scan_ident = scan_alphanumeric || scan_symbolic;
1.125
1.126 val scan_longident =
1.127 @@ -137,24 +120,20 @@
1.128
1.129 val scan_typevar = $$ "'" ^^ scan_letdigs;
1.130
1.131 -
1.132 -(* selectors -- not fully standard conformant *)
1.133 -
1.134 -val scan_numeric =
1.135 - Scan.one (member (op =) (explode "123456789")) ^^ (Scan.many Symbol.is_ascii_digit >> implode);
1.136 -
1.137 -val scan_selector =
1.138 - keep_line ($$ "#") ^^ scan_blanks ^^
1.139 - !!! "malformed record selector" (keep_line (scan_numeric || scan_alphanumeric || scan_symbolic));
1.140 +end;
1.141
1.142
1.143 (* numerals *)
1.144
1.145 +local
1.146 +
1.147 val scan_dec = Scan.many1 Symbol.is_ascii_digit >> implode;
1.148 val scan_hex = Scan.many1 Symbol.is_ascii_hex >> implode;
1.149 val scan_sign = Scan.optional ($$ "~") "";
1.150 val scan_decint = scan_sign ^^ scan_dec;
1.151
1.152 +in
1.153 +
1.154 val scan_word = Scan.this_string "0wx" ^^ scan_hex || Scan.this_string "0w" ^^ scan_dec;
1.155
1.156 val scan_int = scan_sign ^^ (Scan.this_string "0x" ^^ scan_hex || scan_dec);
1.157 @@ -165,40 +144,59 @@
1.158 scan_decint ^^ $$ "." ^^ scan_dec ^^ Scan.optional scan_exp "" ||
1.159 scan_decint ^^ scan_exp;
1.160
1.161 +end;
1.162 +
1.163
1.164 (* chars and strings *)
1.165
1.166 +local
1.167 +
1.168 +val scan_escape =
1.169 + Scan.one (member (op =) (explode "\"\\abtnvfr")) ||
1.170 + $$ "^" ^^ Scan.one (fn s => ord "@" <= ord s andalso ord s <= ord "_") ||
1.171 + Scan.one Symbol.is_ascii_digit ^^
1.172 + Scan.one Symbol.is_ascii_digit ^^
1.173 + Scan.one Symbol.is_ascii_digit;
1.174 +
1.175 +val scan_str =
1.176 + keep_line (Scan.one (fn s => Symbol.is_printable s andalso s <> "\"" andalso s <> "\\")) ||
1.177 + keep_line ($$ "\\") ^^ !!! "bad escape character in string" (keep_line scan_escape);
1.178 +
1.179 val scan_gap = keep_line ($$ "\\") ^^ scan_blanks1 ^^ keep_line ($$ "\\");
1.180 val scan_gaps = Scan.repeat scan_gap >> implode;
1.181
1.182 -val scan_str =
1.183 - scan_blank ||
1.184 - scan_gap ||
1.185 - keep_line ($$ "\\") |-- !!! "bad escape character in string"
1.186 - (scan_blank || keep_line ($$ "\"" || $$ "\\")) ||
1.187 - keep_line (Scan.one (fn s => s <> "\"" andalso s <> "\\" andalso Symbol.is_regular s));
1.188 +in
1.189
1.190 val scan_char =
1.191 keep_line ($$ "#" ^^ $$ "\"") ^^ scan_gaps ^^ scan_str ^^ scan_gaps ^^ keep_line ($$ "\"");
1.192
1.193 val scan_string =
1.194 keep_line ($$ "\"") ^^
1.195 - !!! "missing quote at end of string" ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
1.196 + !!! "missing quote at end of string"
1.197 + ((Scan.repeat (scan_gap || scan_str) >> implode) ^^ keep_line ($$ "\""));
1.198 +
1.199 +end;
1.200
1.201
1.202 (* scan nested comments *)
1.203
1.204 +local
1.205 +
1.206 val scan_cmt =
1.207 Scan.lift scan_blank ||
1.208 - Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
1.209 + Scan.depend (fn (d: int) => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
1.210 Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
1.211 Scan.lift (keep_line ($$ "*" --| Scan.ahead (~$$ ")"))) ||
1.212 Scan.lift (keep_line (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s)));
1.213
1.214 +in
1.215 +
1.216 val scan_comment =
1.217 keep_line ($$ "(" ^^ $$ "*") ^^
1.218 - !!! "missing end of comment"
1.219 - (Scan.pass 0 (Scan.repeat scan_cmt >> implode) ^^ keep_line ($$ "*" ^^ $$ ")"));
1.220 + !!! "missing end of comment"
1.221 + (Scan.pass 0 (Scan.repeat scan_cmt >> implode) ^^ keep_line ($$ "*" ^^ $$ ")"));
1.222 +
1.223 +end;
1.224
1.225
1.226 (* scan token *)
1.227 @@ -210,23 +208,22 @@
1.228 fun token k x = Token (n, (k, x));
1.229 in
1.230 scan_char >> token Char ||
1.231 - scan_selector >> token Selector ||
1.232 scan_string >> token String ||
1.233 scan_blanks1 >> token Space ||
1.234 scan_comment >> token Comment ||
1.235 keep_line (Scan.max token_leq
1.236 (scan_keyword >> token Keyword)
1.237 - (scan_longident >> token LongIdent ||
1.238 + (scan_word >> token Word ||
1.239 + scan_real >> token Real ||
1.240 + scan_int >> token Int ||
1.241 + scan_longident >> token LongIdent ||
1.242 scan_ident >> token Ident ||
1.243 - scan_typevar >> token TypeVar ||
1.244 - scan_real >> token Real ||
1.245 - scan_word >> token Word ||
1.246 - scan_int >> token Int))
1.247 + scan_typevar >> token TypeVar))
1.248 end);
1.249 in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning 10 cs))) scanner end;
1.250
1.251
1.252 -(* token sources *)
1.253 +(* token source *)
1.254
1.255 local
1.256
1.257 @@ -237,12 +234,10 @@
1.258
1.259 in
1.260
1.261 -fun source do_recover src =
1.262 - Source.source' 1 Symbol.stopper (Scan.bulk scan) (Option.map (rpair recover) do_recover) src;
1.263 +fun source src =
1.264 + Source.source' 1 Symbol.stopper (Scan.bulk scan) (SOME (false, recover)) src;
1.265
1.266 end;
1.267
1.268 -fun source_proper src = src |> Source.filter is_proper;
1.269 +end;
1.270
1.271 -
1.272 -end;