src/Pure/ML/ml_lex.ML
changeset 24596 f1333a841b26
parent 24579 852fc50927b1
child 27732 8dbf5761a24a
     1.1 --- a/src/Pure/ML/ml_lex.ML	Sun Sep 16 14:52:29 2007 +0200
     1.2 +++ b/src/Pure/ML/ml_lex.ML	Sun Sep 16 14:52:31 2007 +0200
     1.3 @@ -8,23 +8,18 @@
     1.4  signature ML_LEX =
     1.5  sig
     1.6    datatype token_kind =
     1.7 -    Keyword | Ident | LongIdent | TypeVar | Selector | Word | Int | Real | Char | String |
     1.8 +    Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
     1.9      Space | Comment | Error of string | EOF
    1.10    eqtype token
    1.11 -  val str_of_kind: token_kind -> string
    1.12    val stopper: token * (token -> bool)
    1.13 -  val not_eof: token -> bool
    1.14 +  val is_regular: token -> bool
    1.15 +  val is_improper: token -> bool
    1.16 +  val pos_of: token -> string
    1.17    val kind_of: token -> token_kind
    1.18 -  val is_kind: token_kind -> token -> bool
    1.19 -  val is_proper: token -> bool
    1.20    val val_of: token -> string
    1.21 -  val !!! : string -> (int * 'a -> 'b) -> int * 'a -> 'b
    1.22    val keywords: string list
    1.23 -  val scan: int * string list -> token * (int * string list)
    1.24 -  val source: bool option -> (string, 'a) Source.source ->
    1.25 -    (token, int * (string, 'a) Source.source) Source.source
    1.26 -  val source_proper: (token, 'a) Source.source ->
    1.27 -    (token, (token, 'a) Source.source) Source.source
    1.28 +  val source: (Symbol.symbol, 'a) Source.source ->
    1.29 +    (token, int * (Symbol.symbol, 'a) Source.source) Source.source
    1.30  end;
    1.31  
    1.32  structure ML_Lex: ML_LEX =
    1.33 @@ -35,29 +30,13 @@
    1.34  (* datatype token *)
    1.35  
    1.36  datatype token_kind =
    1.37 -  Keyword | Ident | LongIdent | TypeVar | Selector | Word | Int | Real | Char | String |
    1.38 +  Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
    1.39    Space | Comment | Error of string | EOF;
    1.40  
    1.41  datatype token = Token of int * (token_kind * string);
    1.42  
    1.43 -val str_of_kind =
    1.44 - fn Keyword => "keyword"
    1.45 -  | Ident => "identifier"
    1.46 -  | LongIdent => "long identifier"
    1.47 -  | TypeVar => "type variable"
    1.48 -  | Selector => "record selector"
    1.49 -  | Word => "word"
    1.50 -  | Int => "integer"
    1.51 -  | Real => "real"
    1.52 -  | Char => "character"
    1.53 -  | String => "string"
    1.54 -  | Space => "white space"
    1.55 -  | Comment => "comment"
    1.56 -  | Error _ => "bad input"
    1.57 -  | EOF => "end-of-file";
    1.58  
    1.59 -
    1.60 -(* end-of-file *)
    1.61 +(* control tokens *)
    1.62  
    1.63  val eof = Token (0, (EOF, ""));
    1.64  
    1.65 @@ -65,22 +44,22 @@
    1.66    | is_eof _ = false;
    1.67  
    1.68  val stopper = (eof, is_eof);
    1.69 -val not_eof = not o is_eof;
    1.70  
    1.71  
    1.72 -(* token kind *)
    1.73 +fun is_regular (Token (_, (Error _, _))) = false
    1.74 +  | is_regular (Token (_, (EOF, _))) = false
    1.75 +  | is_regular _ = true;
    1.76 +
    1.77 +fun is_improper (Token (_, (Space, _))) = true
    1.78 +  | is_improper (Token (_, (Comment, _))) = true
    1.79 +  | is_improper _ = false;
    1.80 +
    1.81 +
    1.82 +(* token content *)
    1.83 +
    1.84 +fun pos_of (Token (n, _)) = " (line " ^ string_of_int n ^ ")";
    1.85  
    1.86  fun kind_of (Token (_, (k, _))) = k;
    1.87 -
    1.88 -fun is_kind k (Token (_, (k', _))) = k = k';
    1.89 -
    1.90 -fun is_proper (Token (_, (Space, _))) = false
    1.91 -  | is_proper (Token (_, (Comment, _))) = false
    1.92 -  | is_proper _ = true;
    1.93 -
    1.94 -
    1.95 -(* token value *)
    1.96 -
    1.97  fun val_of (Token (_, (_, x))) = x;
    1.98  
    1.99  fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
   1.100 @@ -97,7 +76,7 @@
   1.101  
   1.102  (* line numbering *)
   1.103  
   1.104 -fun incr_line scan = Scan.depend (fn n => scan >> pair (n + 1));
   1.105 +fun incr_line scan = Scan.depend (fn (n: int) => scan >> pair (n + 1));
   1.106  val keep_line = Scan.lift;
   1.107  
   1.108  val scan_blank =
   1.109 @@ -123,6 +102,8 @@
   1.110  
   1.111  (* identifiers *)
   1.112  
   1.113 +local
   1.114 +
   1.115  val scan_letdigs =
   1.116    Scan.many (Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) >> implode;
   1.117  
   1.118 @@ -130,6 +111,8 @@
   1.119  
   1.120  val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~")) >> implode;
   1.121  
   1.122 +in
   1.123 +
   1.124  val scan_ident = scan_alphanumeric || scan_symbolic;
   1.125  
   1.126  val scan_longident =
   1.127 @@ -137,24 +120,20 @@
   1.128  
   1.129  val scan_typevar = $$ "'" ^^ scan_letdigs;
   1.130  
   1.131 -
   1.132 -(* selectors -- not fully standard conformant *)
   1.133 -
   1.134 -val scan_numeric =
   1.135 -  Scan.one (member (op =) (explode "123456789")) ^^ (Scan.many Symbol.is_ascii_digit >> implode);
   1.136 -
   1.137 -val scan_selector =
   1.138 -  keep_line ($$ "#") ^^ scan_blanks ^^
   1.139 -  !!! "malformed record selector" (keep_line (scan_numeric || scan_alphanumeric || scan_symbolic));
   1.140 +end;
   1.141  
   1.142  
   1.143  (* numerals *)
   1.144  
   1.145 +local
   1.146 +
   1.147  val scan_dec = Scan.many1 Symbol.is_ascii_digit >> implode;
   1.148  val scan_hex = Scan.many1 Symbol.is_ascii_hex >> implode;
   1.149  val scan_sign = Scan.optional ($$ "~") "";
   1.150  val scan_decint = scan_sign ^^ scan_dec;
   1.151  
   1.152 +in
   1.153 +
   1.154  val scan_word = Scan.this_string "0wx" ^^ scan_hex || Scan.this_string "0w" ^^ scan_dec;
   1.155  
   1.156  val scan_int = scan_sign ^^ (Scan.this_string "0x" ^^ scan_hex || scan_dec);
   1.157 @@ -165,40 +144,59 @@
   1.158    scan_decint ^^ $$ "." ^^ scan_dec ^^ Scan.optional scan_exp "" ||
   1.159    scan_decint ^^ scan_exp;
   1.160  
   1.161 +end;
   1.162 +
   1.163  
   1.164  (* chars and strings *)
   1.165  
   1.166 +local
   1.167 +
   1.168 +val scan_escape =
   1.169 +  Scan.one (member (op =) (explode "\"\\abtnvfr")) ||
   1.170 +  $$ "^" ^^ Scan.one (fn s => ord "@" <= ord s andalso ord s <= ord "_") ||
   1.171 +  Scan.one Symbol.is_ascii_digit ^^
   1.172 +    Scan.one Symbol.is_ascii_digit ^^
   1.173 +    Scan.one Symbol.is_ascii_digit;
   1.174 +
   1.175 +val scan_str =
   1.176 +  keep_line (Scan.one (fn s => Symbol.is_printable s andalso s <> "\"" andalso s <> "\\")) ||
   1.177 +  keep_line ($$ "\\") ^^ !!! "bad escape character in string" (keep_line scan_escape);
   1.178 +
   1.179  val scan_gap = keep_line ($$ "\\") ^^ scan_blanks1 ^^ keep_line ($$ "\\");
   1.180  val scan_gaps = Scan.repeat scan_gap >> implode;
   1.181  
   1.182 -val scan_str =
   1.183 -  scan_blank ||
   1.184 -  scan_gap ||
   1.185 -  keep_line ($$ "\\") |-- !!! "bad escape character in string"
   1.186 -      (scan_blank || keep_line ($$ "\"" || $$ "\\")) ||
   1.187 -  keep_line (Scan.one (fn s => s <> "\"" andalso s <> "\\" andalso Symbol.is_regular s));
   1.188 +in
   1.189  
   1.190  val scan_char =
   1.191    keep_line ($$ "#" ^^ $$ "\"") ^^ scan_gaps ^^ scan_str ^^ scan_gaps ^^ keep_line ($$ "\"");
   1.192  
   1.193  val scan_string =
   1.194    keep_line ($$ "\"") ^^
   1.195 -    !!! "missing quote at end of string" ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
   1.196 +  !!! "missing quote at end of string"
   1.197 +    ((Scan.repeat (scan_gap || scan_str) >> implode) ^^ keep_line ($$ "\""));
   1.198 +
   1.199 +end;
   1.200  
   1.201  
   1.202  (* scan nested comments *)
   1.203  
   1.204 +local
   1.205 +
   1.206  val scan_cmt =
   1.207    Scan.lift scan_blank ||
   1.208 -  Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
   1.209 +  Scan.depend (fn (d: int) => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
   1.210    Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
   1.211    Scan.lift (keep_line ($$ "*" --| Scan.ahead (~$$ ")"))) ||
   1.212    Scan.lift (keep_line (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s)));
   1.213  
   1.214 +in
   1.215 +
   1.216  val scan_comment =
   1.217    keep_line ($$ "(" ^^ $$ "*") ^^
   1.218 -    !!! "missing end of comment"
   1.219 -      (Scan.pass 0 (Scan.repeat scan_cmt >> implode) ^^ keep_line ($$ "*" ^^ $$ ")"));
   1.220 +  !!! "missing end of comment"
   1.221 +    (Scan.pass 0 (Scan.repeat scan_cmt >> implode) ^^ keep_line ($$ "*" ^^ $$ ")"));
   1.222 +
   1.223 +end;
   1.224  
   1.225  
   1.226  (* scan token *)
   1.227 @@ -210,23 +208,22 @@
   1.228          fun token k x = Token (n, (k, x));
   1.229        in
   1.230          scan_char >> token Char ||
   1.231 -        scan_selector >> token Selector ||
   1.232          scan_string >> token String ||
   1.233          scan_blanks1 >> token Space ||
   1.234          scan_comment >> token Comment ||
   1.235          keep_line (Scan.max token_leq
   1.236            (scan_keyword >> token Keyword)
   1.237 -          (scan_longident >> token LongIdent ||
   1.238 +          (scan_word >> token Word ||
   1.239 +            scan_real >> token Real ||
   1.240 +            scan_int >> token Int ||
   1.241 +            scan_longident >> token LongIdent ||
   1.242              scan_ident >> token Ident ||
   1.243 -            scan_typevar >> token TypeVar ||
   1.244 -            scan_real >> token Real ||
   1.245 -            scan_word >> token Word ||
   1.246 -            scan_int >> token Int))
   1.247 +            scan_typevar >> token TypeVar))
   1.248        end);
   1.249    in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning 10 cs))) scanner end;
   1.250  
   1.251  
   1.252 -(* token sources *)
   1.253 +(* token source *)
   1.254  
   1.255  local
   1.256  
   1.257 @@ -237,12 +234,10 @@
   1.258  
   1.259  in
   1.260  
   1.261 -fun source do_recover src =
   1.262 -  Source.source' 1 Symbol.stopper (Scan.bulk scan) (Option.map (rpair recover) do_recover) src;
   1.263 +fun source src =
   1.264 +  Source.source' 1 Symbol.stopper (Scan.bulk scan) (SOME (false, recover)) src;
   1.265  
   1.266  end;
   1.267  
   1.268 -fun source_proper src = src |> Source.filter is_proper;
   1.269 +end;
   1.270  
   1.271 -
   1.272 -end;