src/Pure/Syntax/lexicon.ML
author wenzelm
Sat, 23 Jul 2011 16:37:17 +0200
changeset 44818 9b00f09f7721
parent 44304 224006e5ac46
child 45583 fe319b45315c
permissions -rw-r--r--
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
     1 (*  Title:      Pure/Syntax/lexicon.ML
     2     Author:     Tobias Nipkow and Markus Wenzel, TU Muenchen
     3 
     4 Lexer for the inner Isabelle syntax (terms and types).
     5 *)
     6 
     7 signature LEXICON =
     8 sig
     9   structure Syntax:
    10   sig
    11     val const: string -> term
    12     val free: string -> term
    13     val var: indexname -> term
    14   end
    15   val is_identifier: string -> bool
    16   val is_ascii_identifier: string -> bool
    17   val is_xid: string -> bool
    18   val is_tid: string -> bool
    19   val scan_id: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    20   val scan_longid: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    21   val scan_tid: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    22   val scan_nat: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    23   val scan_int: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    24   val scan_float: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    25   val scan_hex: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    26   val scan_bin: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    27   val scan_var: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    28   val scan_tvar: Symbol_Pos.T list -> Symbol_Pos.T list * Symbol_Pos.T list
    29   datatype token_kind =
    30     Literal | IdentSy | LongIdentSy | VarSy | TFreeSy | TVarSy |
    31     NumSy | FloatSy | XNumSy | StrSy | Space | Comment | EOF
    32   datatype token = Token of token_kind * string * Position.range
    33   val str_of_token: token -> string
    34   val pos_of_token: token -> Position.T
    35   val is_proper: token -> bool
    36   val mk_eof: Position.T -> token
    37   val eof: token
    38   val is_eof: token -> bool
    39   val stopper: token Scan.stopper
    40   val idT: typ
    41   val longidT: typ
    42   val varT: typ
    43   val tidT: typ
    44   val tvarT: typ
    45   val terminals: string list
    46   val is_terminal: string -> bool
    47   val report_token: Proof.context -> token -> unit
    48   val reported_token_range: Proof.context -> token -> string
    49   val matching_tokens: token * token -> bool
    50   val valued_token: token -> bool
    51   val predef_term: string -> token option
    52   val implode_xstr: string list -> string
    53   val explode_xstr: string -> string list
    54   val tokenize: Scan.lexicon -> bool -> Symbol_Pos.T list -> token list
    55   val read_indexname: string -> indexname
    56   val read_var: string -> term
    57   val read_variable: string -> indexname option
    58   val read_nat: string -> int option
    59   val read_int: string -> int option
    60   val read_xnum: string -> {radix: int, leading_zeros: int, value: int}
    61   val read_float: string -> {mant: int, exp: int}
    62   val mark_class: string -> string val unmark_class: string -> string
    63   val mark_type: string -> string val unmark_type: string -> string
    64   val mark_const: string -> string val unmark_const: string -> string
    65   val mark_fixed: string -> string val unmark_fixed: string -> string
    66   val unmark:
    67    {case_class: string -> 'a,
    68     case_type: string -> 'a,
    69     case_const: string -> 'a,
    70     case_fixed: string -> 'a,
    71     case_default: string -> 'a} -> string -> 'a
    72   val is_marked: string -> bool
    73   val dummy_type: term
    74   val fun_type: term
    75 end;
    76 
    77 structure Lexicon: LEXICON =
    78 struct
    79 
    80 (** syntaxtic terms **)
    81 
    82 structure Syntax =
    83 struct
    84 
    85 fun const c = Const (c, dummyT);
    86 fun free x = Free (x, dummyT);
    87 fun var xi = Var (xi, dummyT);
    88 
    89 end;
    90 
    91 
    92 
    93 (** is_identifier etc. **)
    94 
    95 val is_identifier = Symbol.is_ident o Symbol.explode;
    96 
    97 fun is_ascii_identifier s =
    98   let val cs = Symbol.explode s
    99   in forall Symbol.is_ascii cs andalso Symbol.is_ident cs end;
   100 
   101 fun is_xid s =
   102   (case Symbol.explode s of
   103     "_" :: cs => Symbol.is_ident cs
   104   | cs => Symbol.is_ident cs);
   105 
   106 fun is_tid s =
   107   (case Symbol.explode s of
   108     "'" :: cs => Symbol.is_ident cs
   109   | _ => false);
   110 
   111 
   112 
   113 (** basic scanners **)
   114 
   115 open Basic_Symbol_Pos;
   116 
   117 fun !!! msg = Symbol_Pos.!!! (fn () => "Inner lexical error: " ^ msg);
   118 
   119 val scan_id =
   120   Scan.one (Symbol.is_letter o Symbol_Pos.symbol) :::
   121   Scan.many (Symbol.is_letdig o Symbol_Pos.symbol);
   122 
   123 val scan_longid = scan_id @@@ (Scan.repeat1 ($$$ "." @@@ scan_id) >> flat);
   124 val scan_tid = $$$ "'" @@@ scan_id;
   125 
   126 val scan_nat = Scan.many1 (Symbol.is_digit o Symbol_Pos.symbol);
   127 val scan_int = $$$ "-" @@@ scan_nat || scan_nat;
   128 val scan_natdot = scan_nat @@@ $$$ "." @@@ scan_nat;
   129 val scan_float = $$$ "-" @@@ scan_natdot || scan_natdot;
   130 val scan_hex = $$$ "0" @@@ $$$ "x" @@@ Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol);
   131 val scan_bin = $$$ "0" @@@ $$$ "b" @@@ Scan.many1 (fn (s, _) => s = "0" orelse s = "1");
   132 
   133 val scan_id_nat = scan_id @@@ Scan.optional ($$$ "." @@@ scan_nat) [];
   134 val scan_var = $$$ "?" @@@ scan_id_nat;
   135 val scan_tvar = $$$ "?" @@@ $$$ "'" @@@ scan_id_nat;
   136 
   137 
   138 
   139 (** datatype token **)
   140 
   141 datatype token_kind =
   142   Literal | IdentSy | LongIdentSy | VarSy | TFreeSy | TVarSy |
   143   NumSy | FloatSy | XNumSy | StrSy | Space | Comment | EOF;
   144 
   145 datatype token = Token of token_kind * string * Position.range;
   146 
   147 fun str_of_token (Token (_, s, _)) = s;
   148 fun pos_of_token (Token (_, _, (pos, _))) = pos;
   149 
   150 fun is_proper (Token (Space, _, _)) = false
   151   | is_proper (Token (Comment, _, _)) = false
   152   | is_proper _ = true;
   153 
   154 
   155 (* stopper *)
   156 
   157 fun mk_eof pos = Token (EOF, "", (pos, Position.none));
   158 val eof = mk_eof Position.none;
   159 
   160 fun is_eof (Token (EOF, _, _)) = true
   161   | is_eof _ = false;
   162 
   163 val stopper = Scan.stopper (K eof) is_eof;
   164 
   165 
   166 (* terminal arguments *)
   167 
   168 val idT = Type ("id", []);
   169 val longidT = Type ("longid", []);
   170 val varT = Type ("var", []);
   171 val tidT = Type ("tid", []);
   172 val tvarT = Type ("tvar", []);
   173 
   174 val terminal_kinds =
   175  [("id", IdentSy),
   176   ("longid", LongIdentSy),
   177   ("var", VarSy),
   178   ("tid", TFreeSy),
   179   ("tvar", TVarSy),
   180   ("num", NumSy),
   181   ("float_token", FloatSy),
   182   ("xnum", XNumSy),
   183   ("xstr", StrSy)];
   184 
   185 val terminals = map #1 terminal_kinds;
   186 val is_terminal = member (op =) terminals;
   187 
   188 
   189 (* markup *)
   190 
   191 val token_kind_markup =
   192  fn Literal     => Markup.literal
   193   | IdentSy     => Markup.ident
   194   | LongIdentSy => Markup.ident
   195   | VarSy       => Markup.var
   196   | TFreeSy     => Markup.tfree
   197   | TVarSy      => Markup.tvar
   198   | NumSy       => Markup.numeral
   199   | FloatSy     => Markup.numeral
   200   | XNumSy      => Markup.numeral
   201   | StrSy       => Markup.inner_string
   202   | Space       => Markup.empty
   203   | Comment     => Markup.inner_comment
   204   | EOF         => Markup.empty;
   205 
   206 fun report_token ctxt (Token (kind, s, (pos, _))) =
   207   let val markup =
   208     if kind = Literal andalso not (is_ascii_identifier s) then Markup.delimiter
   209     else token_kind_markup kind
   210   in Context_Position.report ctxt pos markup end;
   211 
   212 fun reported_token_range ctxt tok =
   213   if is_proper tok
   214   then Context_Position.reported_text ctxt (pos_of_token tok) Markup.token_range ""
   215   else "";
   216 
   217 
   218 (* matching_tokens *)
   219 
   220 fun matching_tokens (Token (Literal, x, _), Token (Literal, y, _)) = x = y
   221   | matching_tokens (Token (k, _, _), Token (k', _, _)) = k = k';
   222 
   223 
   224 (* valued_token *)
   225 
   226 fun valued_token (Token (Literal, _, _)) = false
   227   | valued_token (Token (EOF, _, _)) = false
   228   | valued_token _ = true;
   229 
   230 
   231 (* predef_term *)
   232 
   233 fun predef_term s =
   234   (case AList.lookup (op =) terminal_kinds s of
   235     SOME sy => SOME (Token (sy, s, Position.no_range))
   236   | NONE => NONE);
   237 
   238 
   239 (* xstr tokens *)
   240 
   241 val scan_chr =
   242   $$$ "\\" |-- $$$ "'" ||
   243   Scan.one
   244     ((fn s => s <> "\\" andalso s <> "'" andalso Symbol.is_regular s) o
   245       Symbol_Pos.symbol) >> single ||
   246   $$$ "'" --| Scan.ahead (~$$$ "'");
   247 
   248 val scan_str =
   249   $$$ "'" @@@ $$$ "'" @@@ !!! "missing end of string"
   250     ((Scan.repeat scan_chr >> flat) @@@ $$$ "'" @@@ $$$ "'");
   251 
   252 val scan_str_body =
   253   $$$ "'" |-- $$$ "'" |-- !!! "missing end of string"
   254     ((Scan.repeat scan_chr >> flat) --| $$$ "'" --| $$$ "'");
   255 
   256 
   257 fun implode_xstr cs = enclose "''" "''" (implode (map (fn "'" => "\\'" | c => c) cs));
   258 
   259 fun explode_xstr str =
   260   (case Scan.read Symbol_Pos.stopper scan_str_body (Symbol_Pos.explode (str, Position.none)) of
   261     SOME cs => map Symbol_Pos.symbol cs
   262   | _ => error ("Inner lexical error: literal string expected at " ^ quote str));
   263 
   264 
   265 
   266 (** tokenize **)
   267 
   268 fun token_leq (Token (_, s1, _), Token (_, s2, _)) = s1 <= s2;
   269 fun token kind ss = Token (kind, Symbol_Pos.content ss, Symbol_Pos.range ss);
   270 
   271 fun tokenize lex xids syms =
   272   let
   273     val scan_xid =
   274       if xids then $$$ "_" @@@ scan_id || scan_id
   275       else scan_id;
   276 
   277     val scan_num = scan_hex || scan_bin || scan_int;
   278 
   279     val scan_val =
   280       scan_tvar >> token TVarSy ||
   281       scan_var >> token VarSy ||
   282       scan_tid >> token TFreeSy ||
   283       scan_float >> token FloatSy ||
   284       scan_num >> token NumSy ||
   285       $$$ "#" @@@ scan_num >> token XNumSy ||
   286       scan_longid >> token LongIdentSy ||
   287       scan_xid >> token IdentSy;
   288 
   289     val scan_lit = Scan.literal lex >> token Literal;
   290 
   291     val scan_token =
   292       Symbol_Pos.scan_comment !!! >> token Comment ||
   293       Scan.max token_leq scan_lit scan_val ||
   294       scan_str >> token StrSy ||
   295       Scan.many1 (Symbol.is_blank o Symbol_Pos.symbol) >> token Space;
   296   in
   297     (case Scan.error
   298         (Scan.finite Symbol_Pos.stopper (Scan.repeat scan_token)) syms of
   299       (toks, []) => toks
   300     | (_, ss) => error ("Inner lexical error at: " ^ Symbol_Pos.content ss ^
   301         Position.str_of (#1 (Symbol_Pos.range ss))))
   302   end;
   303 
   304 
   305 
   306 (** scan variables **)
   307 
   308 (* scan_indexname *)
   309 
   310 local
   311 
   312 val scan_vname =
   313   let
   314     fun nat n [] = n
   315       | nat n (c :: cs) = nat (n * 10 + (ord c - ord "0")) cs;
   316 
   317     fun idxname cs ds = (implode (rev cs), nat 0 ds);
   318     fun chop_idx [] ds = idxname [] ds
   319       | chop_idx (cs as (_ :: "\\<^isub>" :: _)) ds = idxname cs ds
   320       | chop_idx (cs as (_ :: "\\<^isup>" :: _)) ds = idxname cs ds
   321       | chop_idx (c :: cs) ds =
   322           if Symbol.is_digit c then chop_idx cs (c :: ds)
   323           else idxname (c :: cs) ds;
   324 
   325     val scan = (scan_id >> map Symbol_Pos.symbol) --
   326       Scan.optional ($$$ "." |-- scan_nat >> (nat 0 o map Symbol_Pos.symbol)) ~1;
   327   in
   328     scan >>
   329       (fn (cs, ~1) => chop_idx (rev cs) []
   330         | (cs, i) => (implode cs, i))
   331   end;
   332 
   333 in
   334 
   335 val scan_indexname = $$$ "'" |-- scan_vname >> (fn (x, i) => ("'" ^ x, i)) || scan_vname;
   336 
   337 end;
   338 
   339 
   340 (* indexname *)
   341 
   342 fun read_indexname s =
   343   (case Scan.read Symbol_Pos.stopper scan_indexname (Symbol_Pos.explode (s, Position.none)) of
   344     SOME xi => xi
   345   | _ => error ("Lexical error in variable name: " ^ quote s));
   346 
   347 
   348 (* read_var *)
   349 
   350 fun read_var str =
   351   let
   352     val scan =
   353       $$$ "?" |-- scan_indexname --| Scan.ahead (Scan.one Symbol_Pos.is_eof)
   354         >> Syntax.var ||
   355       Scan.many (Symbol.is_regular o Symbol_Pos.symbol)
   356         >> (Syntax.free o implode o map Symbol_Pos.symbol);
   357   in the (Scan.read Symbol_Pos.stopper scan (Symbol_Pos.explode (str, Position.none))) end;
   358 
   359 
   360 (* read_variable *)
   361 
   362 fun read_variable str =
   363   let val scan = $$$ "?" |-- scan_indexname || scan_indexname
   364   in Scan.read Symbol_Pos.stopper scan (Symbol_Pos.explode (str, Position.none)) end;
   365 
   366 
   367 (* read numbers *)
   368 
   369 local
   370 
   371 fun nat cs =
   372   Option.map (#1 o Library.read_int o map Symbol_Pos.symbol)
   373     (Scan.read Symbol_Pos.stopper scan_nat cs);
   374 
   375 in
   376 
   377 fun read_nat s = nat (Symbol_Pos.explode (s, Position.none));
   378 
   379 fun read_int s =
   380   (case Symbol_Pos.explode (s, Position.none) of
   381     ("-", _) :: cs => Option.map ~ (nat cs)
   382   | cs => nat cs);
   383 
   384 end;
   385 
   386 
   387 (* read_xnum: hex/bin/decimal *)
   388 
   389 local
   390 
   391 val ten = ord "0" + 10;
   392 val a = ord "a";
   393 val A = ord "A";
   394 val _ = a > A orelse raise Fail "Bad ASCII";
   395 
   396 fun remap_hex c =
   397   let val x = ord c in
   398     if x >= a then chr (x - a + ten)
   399     else if x >= A then chr (x - A + ten)
   400     else c
   401   end;
   402 
   403 fun leading_zeros ["0"] = 0
   404   | leading_zeros ("0" :: cs) = 1 + leading_zeros cs
   405   | leading_zeros _ = 0;
   406 
   407 in
   408 
   409 fun read_xnum str =
   410   let
   411     val (sign, radix, digs) =
   412       (case Symbol.explode (perhaps (try (unprefix "#")) str) of
   413         "0" :: "x" :: cs => (1, 16, map remap_hex cs)
   414       | "0" :: "b" :: cs => (1, 2, cs)
   415       | "-" :: cs => (~1, 10, cs)
   416       | cs => (1, 10, cs));
   417   in
   418    {radix = radix,
   419     leading_zeros = leading_zeros digs,
   420     value = sign * #1 (Library.read_radix_int radix digs)}
   421   end;
   422 
   423 end;
   424 
   425 fun read_float str =
   426   let
   427     val (sign, cs) =
   428       (case Symbol.explode str of
   429         "-" :: cs => (~1, cs)
   430       | cs => (1, cs));
   431     val (intpart, fracpart) =
   432       (case take_prefix Symbol.is_digit cs of
   433         (intpart, "." :: fracpart) => (intpart, fracpart)
   434       | _ => raise Fail "read_float");
   435   in
   436    {mant = sign * #1 (Library.read_int (intpart @ fracpart)),
   437     exp = length fracpart}
   438   end;
   439 
   440 
   441 (* marked logical entities *)
   442 
   443 fun marker s = (prefix s, unprefix s);
   444 
   445 val (mark_class, unmark_class) = marker "\\<^class>";
   446 val (mark_type, unmark_type) = marker "\\<^type>";
   447 val (mark_const, unmark_const) = marker "\\<^const>";
   448 val (mark_fixed, unmark_fixed) = marker "\\<^fixed>";
   449 
   450 fun unmark {case_class, case_type, case_const, case_fixed, case_default} s =
   451   (case try unmark_class s of
   452     SOME c => case_class c
   453   | NONE =>
   454       (case try unmark_type s of
   455         SOME c => case_type c
   456       | NONE =>
   457           (case try unmark_const s of
   458             SOME c => case_const c
   459           | NONE =>
   460               (case try unmark_fixed s of
   461                 SOME c => case_fixed c
   462               | NONE => case_default s))));
   463 
   464 val is_marked =
   465   unmark {case_class = K true, case_type = K true, case_const = K true,
   466     case_fixed = K true, case_default = K false};
   467 
   468 val dummy_type = Syntax.const (mark_type "dummy");
   469 val fun_type = Syntax.const (mark_type "fun");
   470 
   471 end;