src/Pure/Syntax/lexicon.ML
author wenzelm
Tue, 07 Sep 2010 14:08:21 +0200
changeset 39449 e3ac771235f7
parent 38792 e498dc2eb576
child 39776 839873937ddd
permissions -rw-r--r--
report token range after inner parse error -- often provides important clues about misunderstanding concerning lexical phase;
tuned color;
     1 (*  Title:      Pure/Syntax/lexicon.ML
     2     Author:     Tobias Nipkow and Markus Wenzel, TU Muenchen
     3 
     4 Lexer for the inner Isabelle syntax (terms and types).
     5 *)
     6 
     7 signature LEXICON0 =
     8 sig
     9   val is_identifier: string -> bool
    10   val is_ascii_identifier: string -> bool
    11   val is_tid: string -> bool
    12   val scan_id: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    13   val scan_longid: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    14   val scan_tid: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    15   val scan_nat: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    16   val scan_int: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    17   val scan_hex: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    18   val scan_bin: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    19   val scan_var: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    20   val scan_tvar: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    21   val implode_xstr: string list -> string
    22   val explode_xstr: string -> string list
    23   val read_indexname: string -> indexname
    24   val read_var: string -> term
    25   val read_variable: string -> indexname option
    26   val const: string -> term
    27   val free: string -> term
    28   val var: indexname -> term
    29   val read_nat: string -> int option
    30   val read_int: string -> int option
    31   val read_xnum: string -> {radix: int, leading_zeros: int, value: int}
    32   val read_float: string -> {mant: int, exp: int}
    33   val mark_class: string -> string val unmark_class: string -> string
    34   val mark_type: string -> string val unmark_type: string -> string
    35   val mark_const: string -> string val unmark_const: string -> string
    36   val mark_fixed: string -> string val unmark_fixed: string -> string
    37   val unmark:
    38    {case_class: string -> 'a,
    39     case_type: string -> 'a,
    40     case_const: string -> 'a,
    41     case_fixed: string -> 'a,
    42     case_default: string -> 'a} -> string -> 'a
    43   val is_marked: string -> bool
    44 end;
    45 
    46 signature LEXICON =
    47 sig
    48   include LEXICON0
    49   val is_xid: string -> bool
    50   datatype token_kind =
    51     Literal | IdentSy | LongIdentSy | VarSy | TFreeSy | TVarSy |
    52     NumSy | FloatSy | XNumSy | StrSy | Space | Comment | EOF
    53   datatype token = Token of token_kind * string * Position.range
    54   val str_of_token: token -> string
    55   val pos_of_token: token -> Position.T
    56   val is_proper: token -> bool
    57   val mk_eof: Position.T -> token
    58   val eof: token
    59   val is_eof: token -> bool
    60   val stopper: token Scan.stopper
    61   val idT: typ
    62   val longidT: typ
    63   val varT: typ
    64   val tidT: typ
    65   val tvarT: typ
    66   val terminals: string list
    67   val is_terminal: string -> bool
    68   val report_token: Proof.context -> token -> unit
    69   val report_token_range: Proof.context -> token -> unit
    70   val matching_tokens: token * token -> bool
    71   val valued_token: token -> bool
    72   val predef_term: string -> token option
    73   val tokenize: Scan.lexicon -> bool -> Symbol_Pos.T list -> token list
    74 end;
    75 
    76 structure Lexicon: LEXICON =
    77 struct
    78 
    79 (** is_identifier etc. **)
    80 
    81 val is_identifier = Symbol.is_ident o Symbol.explode;
    82 
    83 fun is_ascii_identifier s =
    84   let val cs = Symbol.explode s
    85   in forall Symbol.is_ascii cs andalso Symbol.is_ident cs end;
    86 
    87 fun is_xid s =
    88   (case Symbol.explode s of
    89     "_" :: cs => Symbol.is_ident cs
    90   | cs => Symbol.is_ident cs);
    91 
    92 fun is_tid s =
    93   (case Symbol.explode s of
    94     "'" :: cs => Symbol.is_ident cs
    95   | _ => false);
    96 
    97 
    98 
    99 (** basic scanners **)
   100 
   101 open Basic_Symbol_Pos;
   102 
   103 fun !!! msg = Symbol_Pos.!!! ("Inner lexical error: " ^ msg);
   104 
   105 val scan_id = Scan.one (Symbol.is_letter o symbol) ::: Scan.many (Symbol.is_letdig o symbol);
   106 val scan_longid = scan_id @@@ (Scan.repeat1 ($$$ "." @@@ scan_id) >> flat);
   107 val scan_tid = $$$ "'" @@@ scan_id;
   108 
   109 val scan_nat = Scan.many1 (Symbol.is_digit o symbol);
   110 val scan_int = $$$ "-" @@@ scan_nat || scan_nat;
   111 val scan_natdot = scan_nat @@@ $$$ "." @@@ scan_nat;
   112 val scan_float = $$$ "-" @@@ scan_natdot || scan_natdot;
   113 val scan_hex = $$$ "0" @@@ $$$ "x" @@@ Scan.many1 (Symbol.is_ascii_hex o symbol);
   114 val scan_bin = $$$ "0" @@@ $$$ "b" @@@ Scan.many1 (fn (s, _) => s = "0" orelse s = "1");
   115 
   116 val scan_id_nat = scan_id @@@ Scan.optional ($$$ "." @@@ scan_nat) [];
   117 val scan_var = $$$ "?" @@@ scan_id_nat;
   118 val scan_tvar = $$$ "?" @@@ $$$ "'" @@@ scan_id_nat;
   119 
   120 
   121 
   122 (** datatype token **)
   123 
   124 datatype token_kind =
   125   Literal | IdentSy | LongIdentSy | VarSy | TFreeSy | TVarSy |
   126   NumSy | FloatSy | XNumSy | StrSy | Space | Comment | EOF;
   127 
   128 datatype token = Token of token_kind * string * Position.range;
   129 
   130 fun str_of_token (Token (_, s, _)) = s;
   131 fun pos_of_token (Token (_, _, (pos, _))) = pos;
   132 
   133 fun is_proper (Token (Space, _, _)) = false
   134   | is_proper (Token (Comment, _, _)) = false
   135   | is_proper _ = true;
   136 
   137 
   138 (* stopper *)
   139 
   140 fun mk_eof pos = Token (EOF, "", (pos, Position.none));
   141 val eof = mk_eof Position.none;
   142 
   143 fun is_eof (Token (EOF, _, _)) = true
   144   | is_eof _ = false;
   145 
   146 val stopper = Scan.stopper (K eof) is_eof;
   147 
   148 
   149 (* terminal arguments *)
   150 
   151 val idT = Type ("id", []);
   152 val longidT = Type ("longid", []);
   153 val varT = Type ("var", []);
   154 val tidT = Type ("tid", []);
   155 val tvarT = Type ("tvar", []);
   156 
   157 val terminal_kinds =
   158  [("id", IdentSy),
   159   ("longid", LongIdentSy),
   160   ("var", VarSy),
   161   ("tid", TFreeSy),
   162   ("tvar", TVarSy),
   163   ("num", NumSy),
   164   ("float_token", FloatSy),
   165   ("xnum", XNumSy),
   166   ("xstr", StrSy)];
   167 
   168 val terminals = map #1 terminal_kinds;
   169 val is_terminal = member (op =) terminals;
   170 
   171 
   172 (* markup *)
   173 
   174 val token_kind_markup =
   175  fn Literal     => Markup.literal
   176   | IdentSy     => Markup.ident
   177   | LongIdentSy => Markup.ident
   178   | VarSy       => Markup.var
   179   | TFreeSy     => Markup.tfree
   180   | TVarSy      => Markup.tvar
   181   | NumSy       => Markup.numeral
   182   | FloatSy     => Markup.numeral
   183   | XNumSy      => Markup.numeral
   184   | StrSy       => Markup.inner_string
   185   | Space       => Markup.empty
   186   | Comment     => Markup.inner_comment
   187   | EOF         => Markup.empty;
   188 
   189 fun report_token ctxt (Token (kind, _, (pos, _))) =
   190   Context_Position.report ctxt (token_kind_markup kind) pos;
   191 
   192 fun report_token_range ctxt tok =
   193   if is_proper tok
   194   then Context_Position.report ctxt Markup.token_range (pos_of_token tok)
   195   else ();
   196 
   197 
   198 (* matching_tokens *)
   199 
   200 fun matching_tokens (Token (Literal, x, _), Token (Literal, y, _)) = x = y
   201   | matching_tokens (Token (k, _, _), Token (k', _, _)) = k = k';
   202 
   203 
   204 (* valued_token *)
   205 
   206 fun valued_token (Token (Literal, _, _)) = false
   207   | valued_token (Token (EOF, _, _)) = false
   208   | valued_token _ = true;
   209 
   210 
   211 (* predef_term *)
   212 
   213 fun predef_term s =
   214   (case AList.lookup (op =) terminal_kinds s of
   215     SOME sy => SOME (Token (sy, s, Position.no_range))
   216   | NONE => NONE);
   217 
   218 
   219 (* xstr tokens *)
   220 
   221 val scan_chr =
   222   $$$ "\\" |-- $$$ "'" ||
   223   Scan.one ((fn s => s <> "\\" andalso s <> "'" andalso Symbol.is_regular s) o symbol) >> single ||
   224   $$$ "'" --| Scan.ahead (~$$$ "'");
   225 
   226 val scan_str =
   227   $$$ "'" @@@ $$$ "'" @@@ !!! "missing end of string"
   228     ((Scan.repeat scan_chr >> flat) @@@ $$$ "'" @@@ $$$ "'");
   229 
   230 val scan_str_body =
   231   $$$ "'" |-- $$$ "'" |-- !!! "missing end of string"
   232     ((Scan.repeat scan_chr >> flat) --| $$$ "'" --| $$$ "'");
   233 
   234 
   235 fun implode_xstr cs = enclose "''" "''" (implode (map (fn "'" => "\\'" | c => c) cs));
   236 
   237 fun explode_xstr str =
   238   (case Scan.read Symbol_Pos.stopper scan_str_body (Symbol_Pos.explode (str, Position.none)) of
   239     SOME cs => map symbol cs
   240   | _ => error ("Inner lexical error: literal string expected at " ^ quote str));
   241 
   242 
   243 
   244 (** tokenize **)
   245 
   246 fun token_leq (Token (_, s1, _), Token (_, s2, _)) = s1 <= s2;
   247 fun token kind ss = Token (kind, Symbol_Pos.content ss, Symbol_Pos.range ss);
   248 
   249 fun tokenize lex xids syms =
   250   let
   251     val scan_xid =
   252       if xids then $$$ "_" @@@ scan_id || scan_id
   253       else scan_id;
   254 
   255     val scan_num = scan_hex || scan_bin || scan_int;
   256 
   257     val scan_val =
   258       scan_tvar >> token TVarSy ||
   259       scan_var >> token VarSy ||
   260       scan_tid >> token TFreeSy ||
   261       scan_float >> token FloatSy ||
   262       scan_num >> token NumSy ||
   263       $$$ "#" @@@ scan_num >> token XNumSy ||
   264       scan_longid >> token LongIdentSy ||
   265       scan_xid >> token IdentSy;
   266 
   267     val scan_lit = Scan.literal lex >> token Literal;
   268 
   269     val scan_token =
   270       Symbol_Pos.scan_comment !!! >> token Comment ||
   271       Scan.max token_leq scan_lit scan_val ||
   272       scan_str >> token StrSy ||
   273       Scan.many1 (Symbol.is_blank o symbol) >> token Space;
   274   in
   275     (case Scan.error
   276         (Scan.finite Symbol_Pos.stopper (Scan.repeat scan_token)) syms of
   277       (toks, []) => toks
   278     | (_, ss) => error ("Inner lexical error at: " ^ Symbol_Pos.content ss ^
   279         Position.str_of (#1 (Symbol_Pos.range ss))))
   280   end;
   281 
   282 
   283 
   284 (** scan variables **)
   285 
   286 (* scan_indexname *)
   287 
   288 local
   289 
   290 val scan_vname =
   291   let
   292     fun nat n [] = n
   293       | nat n (c :: cs) = nat (n * 10 + (ord c - ord "0")) cs;
   294 
   295     fun idxname cs ds = (implode (rev cs), nat 0 ds);
   296     fun chop_idx [] ds = idxname [] ds
   297       | chop_idx (cs as (_ :: "\\<^isub>" :: _)) ds = idxname cs ds
   298       | chop_idx (cs as (_ :: "\\<^isup>" :: _)) ds = idxname cs ds
   299       | chop_idx (c :: cs) ds =
   300           if Symbol.is_digit c then chop_idx cs (c :: ds)
   301           else idxname (c :: cs) ds;
   302 
   303     val scan = (scan_id >> map symbol) --
   304       Scan.optional ($$$ "." |-- scan_nat >> (nat 0 o map symbol)) ~1;
   305   in
   306     scan >>
   307       (fn (cs, ~1) => chop_idx (rev cs) []
   308         | (cs, i) => (implode cs, i))
   309   end;
   310 
   311 in
   312 
   313 val scan_indexname = $$$ "'" |-- scan_vname >> (fn (x, i) => ("'" ^ x, i)) || scan_vname;
   314 
   315 end;
   316 
   317 
   318 (* indexname *)
   319 
   320 fun read_indexname s =
   321   (case Scan.read Symbol_Pos.stopper scan_indexname (Symbol_Pos.explode (s, Position.none)) of
   322     SOME xi => xi
   323   | _ => error ("Lexical error in variable name: " ^ quote s));
   324 
   325 
   326 (* read_var *)
   327 
   328 fun const c = Const (c, dummyT);
   329 fun free x = Free (x, dummyT);
   330 fun var xi = Var (xi, dummyT);
   331 
   332 fun read_var str =
   333   let
   334     val scan =
   335       $$$ "?" |-- scan_indexname --| Scan.ahead (Scan.one Symbol_Pos.is_eof) >> var ||
   336       Scan.many (Symbol.is_regular o symbol) >> (free o implode o map symbol);
   337   in the (Scan.read Symbol_Pos.stopper scan (Symbol_Pos.explode (str, Position.none))) end;
   338 
   339 
   340 (* read_variable *)
   341 
   342 fun read_variable str =
   343   let val scan = $$$ "?" |-- scan_indexname || scan_indexname
   344   in Scan.read Symbol_Pos.stopper scan (Symbol_Pos.explode (str, Position.none)) end;
   345 
   346 
   347 (* logical entities *)
   348 
   349 fun marker s = (prefix s, unprefix s);
   350 
   351 val (mark_class, unmark_class) = marker "\\<^class>";
   352 val (mark_type, unmark_type) = marker "\\<^type>";
   353 val (mark_const, unmark_const) = marker "\\<^const>";
   354 val (mark_fixed, unmark_fixed) = marker "\\<^fixed>";
   355 
   356 fun unmark {case_class, case_type, case_const, case_fixed, case_default} s =
   357   (case try unmark_class s of
   358     SOME c => case_class c
   359   | NONE =>
   360       (case try unmark_type s of
   361         SOME c => case_type c
   362       | NONE =>
   363           (case try unmark_const s of
   364             SOME c => case_const c
   365           | NONE =>
   366               (case try unmark_fixed s of
   367                 SOME c => case_fixed c
   368               | NONE => case_default s))));
   369 
   370 val is_marked =
   371   unmark {case_class = K true, case_type = K true, case_const = K true,
   372     case_fixed = K true, case_default = K false};
   373 
   374 
   375 (* read numbers *)
   376 
   377 local
   378 
   379 fun nat cs =
   380   Option.map (#1 o Library.read_int o map symbol)
   381     (Scan.read Symbol_Pos.stopper scan_nat cs);
   382 
   383 in
   384 
   385 fun read_nat s = nat (Symbol_Pos.explode (s, Position.none));
   386 
   387 fun read_int s =
   388   (case Symbol_Pos.explode (s, Position.none) of
   389     ("-", _) :: cs => Option.map ~ (nat cs)
   390   | cs => nat cs);
   391 
   392 end;
   393 
   394 
   395 (* read_xnum: hex/bin/decimal *)
   396 
   397 local
   398 
   399 val ten = ord "0" + 10;
   400 val a = ord "a";
   401 val A = ord "A";
   402 val _ = a > A orelse raise Fail "Bad ASCII";
   403 
   404 fun remap_hex c =
   405   let val x = ord c in
   406     if x >= a then chr (x - a + ten)
   407     else if x >= A then chr (x - A + ten)
   408     else c
   409   end;
   410 
   411 fun leading_zeros ["0"] = 0
   412   | leading_zeros ("0" :: cs) = 1 + leading_zeros cs
   413   | leading_zeros _ = 0;
   414 
   415 in
   416 
   417 fun read_xnum str =
   418   let
   419     val (sign, radix, digs) =
   420       (case Symbol.explode (perhaps (try (unprefix "#")) str) of
   421         "0" :: "x" :: cs => (1, 16, map remap_hex cs)
   422       | "0" :: "b" :: cs => (1, 2, cs)
   423       | "-" :: cs => (~1, 10, cs)
   424       | cs => (1, 10, cs));
   425   in
   426    {radix = radix,
   427     leading_zeros = leading_zeros digs,
   428     value = sign * #1 (Library.read_radix_int radix digs)}
   429   end;
   430 
   431 end;
   432 
   433 fun read_float str =
   434   let
   435     val (sign, cs) =
   436       (case Symbol.explode str of
   437         "-" :: cs => (~1, cs)
   438       | cs => (1, cs));
   439     val (intpart, fracpart) =
   440       (case take_prefix Symbol.is_digit cs of
   441         (intpart, "." :: fracpart) => (intpart, fracpart)
   442       | _ => raise Fail "read_float");
   443   in
   444    {mant = sign * #1 (Library.read_int (intpart @ fracpart)),
   445     exp = length fracpart}
   446   end
   447 
   448 end;