src/Pure/Syntax/parser.ML
author wenzelm
Mon, 04 Apr 2011 22:21:36 +0200
changeset 43101 8e0a4d500e5b
parent 43100 1a2a53d03c31
child 43102 19c23372c686
permissions -rw-r--r--
streamlined token list operations, assuming that the order of union does not matter;
     1 (*  Title:      Pure/Syntax/parser.ML
     2     Author:     Carsten Clasohm, Sonia Mahjoub, and Markus Wenzel, TU Muenchen
     3 
     4 General context-free parser for the inner syntax of terms, types, etc.
     5 *)
     6 
     7 signature PARSER =
     8 sig
     9   type gram
    10   val empty_gram: gram
    11   val extend_gram: Syn_Ext.xprod list -> gram -> gram
    12   val merge_gram: gram * gram -> gram
    13   val pretty_gram: gram -> Pretty.T list
    14   datatype parsetree =
    15     Node of string * parsetree list |
    16     Tip of Lexicon.token
    17   val pretty_parsetree: parsetree -> Pretty.T
    18   val parse: Proof.context -> gram -> string -> Lexicon.token list -> parsetree list
    19   val guess_infix_lr: gram -> string -> (string * bool * bool * int) option
    20   val branching_level: int Config.T
    21 end;
    22 
    23 structure Parser: PARSER =
    24 struct
    25 
    26 (** datatype gram **)
    27 
    28 (*production for the NTs are stored in a vector
    29   so we can identify NTs by their index*)
    30 type nt_tag = int;
    31 
    32 datatype symb =
    33   Terminal of Lexicon.token
    34 | Nonterminal of nt_tag * int;  (*(tag, precedence)*)
    35 
    36 type nt_gram =
    37   ((nt_tag list * Lexicon.token list) *
    38     (Lexicon.token option * (symb list * string * int) list) list);
    39   (*(([dependent_nts], [start_tokens]), [(start_token, [(rhs, name, prio)])])*)
    40   (*depent_nts is a list of all NTs whose lookahead depends on this NT's lookahead*)
    41 
    42 datatype gram =
    43   Gram of
    44    {nt_count: int,
    45     prod_count: int,
    46     tags: nt_tag Symtab.table,
    47     chains: (nt_tag * nt_tag list) list,  (*[(to, [from])]*)
    48     lambdas: nt_tag list,
    49     prods: nt_gram Vector.vector};
    50     (*"tags" is used to map NT names (i.e. strings) to tags;
    51      chain productions are not stored as normal productions
    52      but instead as an entry in "chains";
    53      lambda productions are stored as normal productions
    54      and also as an entry in "lambdas"*)
    55 
    56 val union_token = union Lexicon.matching_tokens;
    57 val subtract_token = subtract Lexicon.matching_tokens;
    58 
    59 (*productions for which no starting token is
    60   known yet are associated with this token*)
    61 val unknown_start = Lexicon.eof;
    62 
    63 (*get all NTs that are connected with a list of NTs*)
    64 fun connected_with _ ([]: nt_tag list) relatives = relatives
    65   | connected_with chains (root :: roots) relatives =
    66       let val branches = subtract (op =) relatives (these (AList.lookup (op =) chains root));
    67       in connected_with chains (branches @ roots) (branches @ relatives) end;
    68 
    69 (*convert productions to grammar;
    70   N.B. that the chains parameter has the form [(from, [to])];
    71   prod_count is of type "int option" and is only updated if it is <> NONE*)
    72 fun add_prods _ chains lambdas prod_count [] = (chains, lambdas, prod_count)
    73   | add_prods prods chains lambdas prod_count ((lhs, new_prod as (rhs, name, pri)) :: ps) =
    74       let
    75         val chain_from =
    76           (case (pri, rhs) of
    77             (~1, [Nonterminal (id, ~1)]) => SOME id
    78           | _ => NONE);
    79 
    80         (*store chain if it does not already exist*)
    81         val (new_chain, chains') =
    82           (case chain_from of
    83             NONE => (NONE, chains)
    84           | SOME from =>
    85               let val old_tos = these (AList.lookup (op =) chains from) in
    86                 if member (op =) old_tos lhs then (NONE, chains)
    87                 else (SOME from, AList.update (op =) (from, insert (op =) lhs old_tos) chains)
    88               end);
    89 
    90         (*propagate new chain in lookahead and lambda lists;
    91           added_starts is used later to associate existing
    92           productions with new starting tokens*)
    93         val (added_starts, lambdas') =
    94           if is_none new_chain then ([], lambdas)
    95           else
    96             let (*lookahead of chain's source*)
    97               val ((from_nts, from_tks), _) = Array.sub (prods, the new_chain);
    98 
    99               (*copy from's lookahead to chain's destinations*)
   100               fun copy_lookahead [] added = added
   101                 | copy_lookahead (to :: tos) added =
   102                     let
   103                       val ((to_nts, to_tks), ps) = Array.sub (prods, to);
   104 
   105                       val new_tks = subtract (op =) to_tks from_tks;  (*added lookahead tokens*)
   106                       val _ = Array.update (prods, to, ((to_nts, to_tks @ new_tks), ps));
   107                     in
   108                       copy_lookahead tos (if null new_tks then added else (to, new_tks) :: added)
   109                     end;
   110 
   111               val tos = connected_with chains' [lhs] [lhs];
   112             in
   113               (copy_lookahead tos [],
   114                 union (op =) (if member (op =) lambdas lhs then tos else []) lambdas)
   115             end;
   116 
   117         (*test if new production can produce lambda
   118           (rhs must either be empty or only consist of lambda NTs)*)
   119         val (new_lambda, lambdas') =
   120           if forall
   121             (fn Nonterminal (id, _) => member (op =) lambdas' id
   122               | Terminal _ => false) rhs
   123           then (true, union (op =) (connected_with chains' [lhs] [lhs]) lambdas')
   124           else (false, lambdas');
   125 
   126         (*list optional terminal and all nonterminals on which the lookahead
   127           of a production depends*)
   128         fun lookahead_dependency _ [] nts = (NONE, nts)
   129           | lookahead_dependency _ (Terminal tk :: _) nts = (SOME tk, nts)
   130           | lookahead_dependency lambdas (Nonterminal (nt, _) :: symbs) nts =
   131               if member (op =) lambdas nt then
   132                 lookahead_dependency lambdas symbs (nt :: nts)
   133               else (NONE, nt :: nts);
   134 
   135         (*get all known starting tokens for a nonterminal*)
   136         fun starts_for_nt nt = snd (fst (Array.sub (prods, nt)));
   137 
   138         (*update prods, lookaheads, and lambdas according to new lambda NTs*)
   139         val (added_starts', lambdas') =
   140           let
   141             (*propagate added lambda NT*)
   142             fun propagate_lambda [] added_starts lambdas = (added_starts, lambdas)
   143               | propagate_lambda (l :: ls) added_starts lambdas =
   144                   let
   145                     (*get lookahead for lambda NT*)
   146                     val ((dependent, l_starts), _) = Array.sub (prods, l);
   147 
   148                     (*check productions whose lookahead may depend on lambda NT*)
   149                     fun examine_prods [] add_lambda nt_dependencies added_tks nt_prods =
   150                           (add_lambda, nt_dependencies, added_tks, nt_prods)
   151                       | examine_prods ((p as (rhs, _, _)) :: ps) add_lambda
   152                             nt_dependencies added_tks nt_prods =
   153                           let val (tk, nts) = lookahead_dependency lambdas rhs [] in
   154                             if member (op =) nts l then       (*update production's lookahead*)
   155                               let
   156                                 val new_lambda = is_none tk andalso subset (op =) (nts, lambdas);
   157 
   158                                 val new_tks =
   159                                   (if is_some tk then [the tk] else [])
   160                                   |> fold (union_token o starts_for_nt) nts
   161                                   |> subtract (op =) l_starts;
   162 
   163                                 val added_tks' = union_token added_tks new_tks;
   164 
   165                                 val nt_dependencies' = union (op =) nts nt_dependencies;
   166 
   167                                 (*associate production with new starting tokens*)
   168                                 fun copy ([]: Lexicon.token option list) nt_prods = nt_prods
   169                                   | copy (tk :: tks) nt_prods =
   170                                       let
   171                                         val old_prods = these (AList.lookup (op =) nt_prods tk);
   172                                         val prods' = p :: old_prods;
   173                                       in
   174                                         nt_prods
   175                                         |> AList.update (op =) (tk, prods')
   176                                         |> copy tks
   177                                       end;
   178 
   179                                 val nt_prods' =
   180                                   let val new_opt_tks = map SOME new_tks in
   181                                     copy
   182                                       ((if new_lambda then [NONE] else []) @ new_opt_tks) nt_prods
   183                                   end;
   184                               in
   185                                 examine_prods ps (add_lambda orelse new_lambda)
   186                                   nt_dependencies' added_tks' nt_prods'
   187                               end
   188                             else (*skip production*)
   189                               examine_prods ps add_lambda nt_dependencies added_tks nt_prods
   190                           end;
   191 
   192                     (*check each NT whose lookahead depends on new lambda NT*)
   193                     fun process_nts [] added_lambdas added_starts =
   194                           (added_lambdas, added_starts)
   195                       | process_nts (nt :: nts) added_lambdas added_starts =
   196                           let
   197                             val (lookahead as (old_nts, old_tks), nt_prods) = Array.sub (prods, nt);
   198 
   199                             (*existing productions whose lookahead may depend on l*)
   200                             val tk_prods =
   201                               these
   202                                 (AList.lookup (op =) nt_prods
   203                                   (SOME (hd l_starts handle Empty => unknown_start)));
   204 
   205                             (*add_lambda is true if an existing production of the nt
   206                               produces lambda due to the new lambda NT l*)
   207                             val (add_lambda, nt_dependencies, added_tks, nt_prods') =
   208                               examine_prods tk_prods false [] [] nt_prods;
   209 
   210                             val added_nts = subtract (op =) old_nts nt_dependencies;
   211 
   212                             val added_lambdas' =
   213                               if add_lambda then nt :: added_lambdas
   214                               else added_lambdas;
   215                             val _ =
   216                               Array.update
   217                                 (prods, nt, ((added_nts @ old_nts, old_tks @ added_tks), nt_prods'));
   218                               (*N.B. that because the tks component
   219                                 is used to access existing
   220                                 productions we have to add new
   221                                 tokens at the _end_ of the list*)
   222                           in
   223                             if null added_tks then
   224                               process_nts nts added_lambdas' added_starts
   225                             else
   226                               process_nts nts added_lambdas' ((nt, added_tks) :: added_starts)
   227                           end;
   228 
   229                     val (added_lambdas, added_starts') = process_nts dependent [] added_starts;
   230                     val added_lambdas' = subtract (op =) lambdas added_lambdas;
   231                   in
   232                     propagate_lambda (ls @ added_lambdas') added_starts' (added_lambdas' @ lambdas)
   233                   end;
   234           in propagate_lambda (subtract (op =) lambdas lambdas') added_starts lambdas' end;
   235 
   236         (*insert production into grammar*)
   237         val (added_starts', prod_count') =
   238           if is_some chain_from
   239           then (added_starts', prod_count)  (*don't store chain production*)
   240           else
   241             let
   242               (*lookahead tokens of new production and on which
   243                 NTs lookahead depends*)
   244               val (start_tk, start_nts) = lookahead_dependency lambdas' rhs [];
   245 
   246               val start_tks =
   247                 (if is_some start_tk then [the start_tk] else [])
   248                 |> fold (union_token o starts_for_nt) start_nts;
   249 
   250               val opt_starts =
   251                (if new_lambda then [NONE]
   252                 else if null start_tks then [SOME unknown_start]
   253                 else []) @ map SOME start_tks;
   254 
   255               (*add lhs NT to list of dependent NTs in lookahead*)
   256               fun add_nts [] = ()
   257                 | add_nts (nt :: nts) =
   258                     let val ((old_nts, old_tks), ps) = Array.sub (prods, nt) in
   259                       if member (op =) old_nts lhs then ()
   260                       else Array.update (prods, nt, ((lhs :: old_nts, old_tks), ps))
   261                     end;
   262 
   263               (*add new start tokens to chained NTs' lookahead list;
   264                 also store new production for lhs NT*)
   265               fun add_tks [] added prod_count = (added, prod_count)
   266                 | add_tks (nt :: nts) added prod_count =
   267                     let
   268                       val ((old_nts, old_tks), nt_prods) = Array.sub (prods, nt);
   269 
   270                       val new_tks = subtract_token old_tks start_tks;
   271 
   272                       (*store new production*)
   273                       fun store [] prods is_new =
   274                             (prods,
   275                               if is_some prod_count andalso is_new then
   276                                 Option.map (fn x => x + 1) prod_count
   277                               else prod_count, is_new)
   278                         | store (tk :: tks) prods is_new =
   279                             let
   280                               val tk_prods = these (AList.lookup (op =) prods tk);
   281 
   282                               (*if prod_count = NONE then we can assume that
   283                                 grammar does not contain new production already*)
   284                               val (tk_prods', is_new') =
   285                                 if is_some prod_count then
   286                                   if member (op =) tk_prods new_prod then (tk_prods, false)
   287                                   else (new_prod :: tk_prods, true)
   288                                 else (new_prod :: tk_prods, true);
   289 
   290                               val prods' =
   291                                 if is_new' then
   292                                   AList.update (op =) (tk: Lexicon.token option, tk_prods') prods
   293                                 else prods;
   294                             in store tks prods' (is_new orelse is_new') end;
   295 
   296                       val (nt_prods', prod_count', changed) =
   297                         if nt = lhs
   298                         then store opt_starts nt_prods false
   299                         else (nt_prods, prod_count, false);
   300                       val _ =
   301                         if not changed andalso null new_tks then ()
   302                         else Array.update (prods, nt, ((old_nts, old_tks @ new_tks), nt_prods'));
   303                     in
   304                       add_tks nts
   305                         (if null new_tks then added else (nt, new_tks) :: added) prod_count'
   306                     end;
   307               val _ = add_nts start_nts;
   308             in
   309               add_tks (connected_with chains' [lhs] [lhs]) [] prod_count
   310             end;
   311 
   312         (*associate productions with new lookaheads*)
   313         val _ =
   314           let
   315             (*propagate added start tokens*)
   316             fun add_starts [] = ()
   317               | add_starts ((changed_nt, new_tks) :: starts) =
   318                   let
   319                     (*token under which old productions which
   320                       depend on changed_nt could be stored*)
   321                     val key =
   322                       (case find_first (not o member (op =) new_tks) (starts_for_nt changed_nt) of
   323                         NONE => SOME unknown_start
   324                       | t => t);
   325 
   326                     (*copy productions whose lookahead depends on changed_nt;
   327                       if key = SOME unknown_start then tk_prods is used to hold
   328                       the productions not copied*)
   329                     fun update_prods [] result = result
   330                       | update_prods ((p as (rhs, _: string, _: nt_tag)) :: ps)
   331                             (tk_prods, nt_prods) =
   332                           let
   333                             (*lookahead dependency for production*)
   334                             val (tk, depends) = lookahead_dependency lambdas' rhs [];
   335 
   336                             (*test if this production has to be copied*)
   337                             val update = member (op =) depends changed_nt;
   338 
   339                             (*test if production could already be associated with
   340                               a member of new_tks*)
   341                             val lambda =
   342                               length depends > 1 orelse
   343                               not (null depends) andalso is_some tk
   344                               andalso member (op =) new_tks (the tk);
   345 
   346                             (*associate production with new starting tokens*)
   347                             fun copy ([]: Lexicon.token list) nt_prods = nt_prods
   348                               | copy (tk :: tks) nt_prods =
   349                                  let
   350                                    val tk_prods = these (AList.lookup (op =) nt_prods (SOME tk));
   351 
   352                                    val tk_prods' =
   353                                      if not lambda then p :: tk_prods
   354                                      else insert (op =) p tk_prods;
   355                                      (*if production depends on lambda NT we
   356                                        have to look for duplicates*)
   357                                  in
   358                                    nt_prods
   359                                    |> AList.update (op =) (SOME tk, tk_prods')
   360                                    |> copy tks
   361                                  end;
   362                             val result =
   363                               if update then (tk_prods, copy new_tks nt_prods)
   364                               else if key = SOME unknown_start then (p :: tk_prods, nt_prods)
   365                               else (tk_prods, nt_prods);
   366                           in update_prods ps result end;
   367 
   368                     (*copy existing productions for new starting tokens*)
   369                     fun process_nts [] added = added
   370                       | process_nts (nt :: nts) added =
   371                           let
   372                             val (lookahead as (old_nts, old_tks), nt_prods) = Array.sub (prods, nt);
   373 
   374                             val tk_prods = these (AList.lookup (op =) nt_prods key);
   375 
   376                             (*associate productions with new lookahead tokens*)
   377                             val (tk_prods', nt_prods') = update_prods tk_prods ([], nt_prods);
   378 
   379                             val nt_prods'' =
   380                               if key = SOME unknown_start then
   381                                 AList.update (op =) (key, tk_prods') nt_prods'
   382                               else nt_prods';
   383 
   384                             val added_tks = subtract_token old_tks new_tks;
   385                           in
   386                             if null added_tks then
   387                               (Array.update (prods, nt, (lookahead, nt_prods''));
   388                                 process_nts nts added)
   389                             else
   390                               (Array.update (prods, nt, ((old_nts, added_tks @ old_tks), nt_prods''));
   391                                 process_nts nts ((nt, added_tks) :: added))
   392                           end;
   393 
   394                     val ((dependent, _), _) = Array.sub (prods, changed_nt);
   395                   in add_starts (starts @ process_nts dependent []) end;
   396           in add_starts added_starts' end;
   397       in add_prods prods chains' lambdas' prod_count ps end;
   398 
   399 
   400 (* pretty_gram *)
   401 
   402 fun pretty_gram (Gram {tags, prods, chains, ...}) =
   403   let
   404     fun pretty_name name = [Pretty.str (name ^ " =")];
   405 
   406     val nt_name = the o Inttab.lookup (Inttab.make (map swap (Symtab.dest tags)));
   407 
   408     fun pretty_symb (Terminal (Lexicon.Token (Lexicon.Literal, s, _))) = Pretty.quote (Pretty.str s)
   409       | pretty_symb (Terminal tok) = Pretty.str (Lexicon.str_of_token tok)
   410       | pretty_symb (Nonterminal (tag, p)) =
   411           Pretty.str (nt_name tag ^ "[" ^ signed_string_of_int p ^ "]");
   412 
   413     fun pretty_const "" = []
   414       | pretty_const c = [Pretty.str ("=> " ^ quote c)];
   415 
   416     fun pretty_pri p = [Pretty.str ("(" ^ signed_string_of_int p ^ ")")];
   417 
   418     fun pretty_prod name (symbs, const, pri) =
   419       Pretty.block (Pretty.breaks (pretty_name name @
   420         map pretty_symb symbs @ pretty_const const @ pretty_pri pri));
   421 
   422     fun pretty_nt (name, tag) =
   423       let
   424         fun prod_of_chain from = ([Nonterminal (from, ~1)], "", ~1);
   425 
   426         val nt_prods =
   427           fold (union (op =) o snd) (snd (Vector.sub (prods, tag))) [] @
   428             map prod_of_chain (these (AList.lookup (op =) chains tag));
   429       in map (pretty_prod name) nt_prods end;
   430 
   431   in maps pretty_nt (sort_wrt fst (Symtab.dest tags)) end;
   432 
   433 
   434 
   435 (** Operations on gramars **)
   436 
   437 val empty_gram =
   438   Gram
   439    {nt_count = 0,
   440     prod_count = 0,
   441     tags = Symtab.empty, chains = [],
   442     lambdas = [],
   443     prods = Vector.fromList [(([], []), [])]};
   444 
   445 
   446 (*Invert list of chain productions*)
   447 fun inverse_chains [] result = result
   448   | inverse_chains ((root, branches: nt_tag list) :: cs) result =
   449       let
   450         fun add ([]: nt_tag list) result = result
   451           | add (id :: ids) result =
   452               let val old = these (AList.lookup (op =) result id);
   453               in add ids (AList.update (op =) (id, root :: old) result) end;
   454       in inverse_chains cs (add branches result) end;
   455 
   456 
   457 (*Add productions to a grammar*)
   458 fun extend_gram [] gram = gram
   459   | extend_gram xprods (Gram {nt_count, prod_count, tags, chains, lambdas, prods}) =
   460       let
   461         (*Get tag for existing nonterminal or create a new one*)
   462         fun get_tag nt_count tags nt =
   463           (case Symtab.lookup tags nt of
   464             SOME tag => (nt_count, tags, tag)
   465           | NONE => (nt_count + 1, Symtab.update_new (nt, nt_count) tags, nt_count));
   466 
   467         (*Convert symbols to the form used by the parser;
   468           delimiters and predefined terms are stored as terminals,
   469           nonterminals are converted to integer tags*)
   470         fun symb_of [] nt_count tags result = (nt_count, tags, rev result)
   471           | symb_of (Syn_Ext.Delim s :: ss) nt_count tags result =
   472               symb_of ss nt_count tags
   473                 (Terminal (Lexicon.Token (Lexicon.Literal, s, Position.no_range)) :: result)
   474           | symb_of (Syn_Ext.Argument (s, p) :: ss) nt_count tags result =
   475               let
   476                 val (nt_count', tags', new_symb) =
   477                   (case Lexicon.predef_term s of
   478                     NONE =>
   479                       let val (nt_count', tags', s_tag) = get_tag nt_count tags s;
   480                       in (nt_count', tags', Nonterminal (s_tag, p)) end
   481                   | SOME tk => (nt_count, tags, Terminal tk));
   482               in symb_of ss nt_count' tags' (new_symb :: result) end
   483           | symb_of (_ :: ss) nt_count tags result = symb_of ss nt_count tags result;
   484 
   485         (*Convert list of productions by invoking symb_of for each of them*)
   486         fun prod_of [] nt_count prod_count tags result =
   487               (nt_count, prod_count, tags, result)
   488           | prod_of (Syn_Ext.XProd (lhs, xsymbs, const, pri) :: ps)
   489                 nt_count prod_count tags result =
   490               let
   491                 val (nt_count', tags', lhs_tag) = get_tag nt_count tags lhs;
   492                 val (nt_count'', tags'', prods) = symb_of xsymbs nt_count' tags' [];
   493               in
   494                 prod_of ps nt_count'' (prod_count + 1) tags''
   495                   ((lhs_tag, (prods, const, pri)) :: result)
   496               end;
   497 
   498         val (nt_count', prod_count', tags', xprods') =
   499           prod_of xprods nt_count prod_count tags [];
   500 
   501         (*Copy array containing productions of old grammar;
   502           this has to be done to preserve the old grammar while being able
   503           to change the array's content*)
   504         val prods' =
   505           let
   506             fun get_prod i =
   507               if i < nt_count then Vector.sub (prods, i)
   508               else (([], []), []);
   509           in Array.tabulate (nt_count', get_prod) end;
   510 
   511         val fromto_chains = inverse_chains chains [];
   512 
   513         (*Add new productions to old ones*)
   514         val (fromto_chains', lambdas', _) =
   515           add_prods prods' fromto_chains lambdas NONE xprods';
   516 
   517         val chains' = inverse_chains fromto_chains' [];
   518       in
   519         Gram
   520          {nt_count = nt_count',
   521           prod_count = prod_count',
   522           tags = tags',
   523           chains = chains',
   524           lambdas = lambdas',
   525           prods = Array.vector prods'}
   526       end;
   527 
   528 
   529 (*Merge two grammars*)
   530 fun merge_gram (gram_a, gram_b) =
   531   let
   532     (*find out which grammar is bigger*)
   533     val
   534       (Gram {nt_count = nt_count1, prod_count = prod_count1, tags = tags1,
   535         chains = chains1, lambdas = lambdas1, prods = prods1},
   536       Gram {nt_count = nt_count2, prod_count = prod_count2, tags = tags2,
   537         chains = chains2, lambdas = lambdas2, prods = prods2}) =
   538       let
   539         val Gram {prod_count = count_a, ...} = gram_a;
   540         val Gram {prod_count = count_b, ...} = gram_b;
   541       in
   542         if count_a > count_b
   543         then (gram_a, gram_b)
   544         else (gram_b, gram_a)
   545       end;
   546 
   547     (*get existing tag from grammar1 or create a new one*)
   548     fun get_tag nt_count tags nt =
   549       (case Symtab.lookup tags nt of
   550         SOME tag => (nt_count, tags, tag)
   551       | NONE => (nt_count + 1, Symtab.update_new (nt, nt_count) tags, nt_count));
   552 
   553     val ((nt_count1', tags1'), tag_table) =
   554       let
   555         val tag_list = Symtab.dest tags2;
   556 
   557         val table = Array.array (nt_count2, ~1);
   558 
   559         fun store_tag nt_count tags ~1 = (nt_count, tags)
   560           | store_tag nt_count tags tag =
   561               let
   562                 val (nt_count', tags', tag') =
   563                   get_tag nt_count tags (fst (the (find_first (fn (n, t) => t = tag) tag_list)));
   564                 val _ = Array.update (table, tag, tag');
   565               in store_tag nt_count' tags' (tag - 1) end;
   566       in (store_tag nt_count1 tags1 (nt_count2 - 1), table) end;
   567 
   568     (*convert grammar2 tag to grammar1 tag*)
   569     fun convert_tag tag = Array.sub (tag_table, tag);
   570 
   571     (*convert chain list to raw productions*)
   572     fun mk_chain_prods [] result = result
   573       | mk_chain_prods ((to, froms) :: cs) result =
   574           let
   575             val to_tag = convert_tag to;
   576 
   577             fun make [] result = result
   578               | make (from :: froms) result = make froms
   579                   ((to_tag, ([Nonterminal (convert_tag from, ~1)], "", ~1)) :: result);
   580           in mk_chain_prods cs (make froms [] @ result) end;
   581 
   582     val chain_prods = mk_chain_prods chains2 [];
   583 
   584     (*convert prods2 array to productions*)
   585     fun process_nt ~1 result = result
   586       | process_nt nt result =
   587           let
   588             val nt_prods = fold (union (op =) o snd) (snd (Vector.sub (prods2, nt))) [];
   589             val lhs_tag = convert_tag nt;
   590 
   591             (*convert tags in rhs*)
   592             fun process_rhs [] result = result
   593               | process_rhs (Terminal tk :: rhs) result =
   594                   process_rhs rhs (result @ [Terminal tk])
   595               | process_rhs (Nonterminal (nt, prec) :: rhs) result =
   596                   process_rhs rhs (result @ [Nonterminal (convert_tag nt, prec)]);
   597 
   598             (*convert tags in productions*)
   599             fun process_prods [] result = result
   600               | process_prods ((rhs, id, prec) :: ps) result =
   601                   process_prods ps ((lhs_tag, (process_rhs rhs [], id, prec)) :: result);
   602           in process_nt (nt - 1) (process_prods nt_prods [] @ result) end;
   603 
   604     val raw_prods = chain_prods @ process_nt (nt_count2 - 1) [];
   605 
   606     val prods1' =
   607       let
   608         fun get_prod i =
   609           if i < nt_count1 then Vector.sub (prods1, i)
   610           else (([], []), []);
   611       in Array.tabulate (nt_count1', get_prod) end;
   612 
   613     val fromto_chains = inverse_chains chains1 [];
   614 
   615     val (fromto_chains', lambdas', SOME prod_count1') =
   616       add_prods prods1' fromto_chains lambdas1 (SOME prod_count1) raw_prods;
   617 
   618     val chains' = inverse_chains fromto_chains' [];
   619   in
   620     Gram
   621      {nt_count = nt_count1',
   622       prod_count = prod_count1',
   623       tags = tags1',
   624       chains = chains',
   625       lambdas = lambdas',
   626       prods = Array.vector prods1'}
   627   end;
   628 
   629 
   630 
   631 (** Parser **)
   632 
   633 datatype parsetree =
   634   Node of string * parsetree list |
   635   Tip of Lexicon.token;
   636 
   637 fun pretty_parsetree (Node (c, pts)) =
   638       Pretty.enclose "(" ")" (Pretty.breaks
   639         (Pretty.quote (Pretty.str c) :: map pretty_parsetree pts))
   640   | pretty_parsetree (Tip tok) = Pretty.str (Lexicon.str_of_token tok);
   641 
   642 type state =
   643   nt_tag * int *    (*identification and production precedence*)
   644   parsetree list *  (*already parsed nonterminals on rhs*)
   645   symb list *       (*rest of rhs*)
   646   string *          (*name of production*)
   647   int;              (*index for previous state list*)
   648 
   649 
   650 (*Get all rhss with precedence >= min_prec*)
   651 fun get_RHS min_prec = filter (fn (_, _, prec: int) => prec >= min_prec);
   652 
   653 (*Get all rhss with precedence >= min_prec and < max_prec*)
   654 fun get_RHS' min_prec max_prec =
   655   filter (fn (_, _, prec: int) => prec >= min_prec andalso prec < max_prec);
   656 
   657 (*Make states using a list of rhss*)
   658 fun mk_states i min_prec lhs_ID rhss =
   659   let fun mk_state (rhs, id, prod_prec) = (lhs_ID, prod_prec, [], rhs, id, i);
   660   in map mk_state rhss end;
   661 
   662 (*Add parse tree to list and eliminate duplicates
   663   saving the maximum precedence*)
   664 fun conc (t: parsetree list, prec: int) [] = (NONE, [(t, prec)])
   665   | conc (t, prec) ((t', prec') :: ts) =
   666       if t = t' then
   667         (SOME prec',
   668           if prec' >= prec then (t', prec') :: ts
   669           else (t, prec) :: ts)
   670       else
   671         let val (n, ts') = conc (t, prec) ts
   672         in (n, (t', prec') :: ts') end;
   673 
   674 (*Update entry in used*)
   675 fun update_trees ((B: nt_tag, (i, ts)) :: used) (A, t) =
   676   if A = B then
   677     let val (n, ts') = conc t ts
   678     in ((A, (i, ts')) :: used, n) end
   679   else
   680     let val (used', n) = update_trees used (A, t)
   681     in ((B, (i, ts)) :: used', n) end;
   682 
   683 (*Replace entry in used*)
   684 fun update_prec (A: nt_tag, prec) used =
   685   let
   686     fun update ((hd as (B, (_, ts))) :: used, used') =
   687       if A = B
   688       then used' @ ((A, (prec, ts)) :: used)
   689       else update (used, hd :: used')
   690   in update (used, []) end;
   691 
   692 fun getS A max_prec Si =
   693   filter
   694     (fn (_, _, _, Nonterminal (B, prec) :: _, _, _) => A = B andalso prec <= max_prec
   695       | _ => false) Si;
   696 
   697 fun getS' A max_prec min_prec Si =
   698   filter
   699     (fn (_, _, _, Nonterminal (B, prec) :: _, _, _)
   700           => A = B andalso prec > min_prec andalso prec <= max_prec
   701       | _ => false) Si;
   702 
   703 fun get_states Estate i ii A max_prec =
   704   filter
   705     (fn (_, _, _, Nonterminal (B, prec) :: _, _, _) => A = B andalso prec <= max_prec
   706       | _ => false)
   707     (Array.sub (Estate, ii));
   708 
   709 
   710 fun movedot_term (A, j, ts, Terminal a :: sa, id, i) c =
   711   if Lexicon.valued_token c then (A, j, ts @ [Tip c], sa, id, i)
   712   else (A, j, ts, sa, id, i);
   713 
   714 fun movedot_nonterm ts (A, j, tss, Nonterminal _ :: sa, id, i) =
   715   (A, j, tss @ ts, sa, id, i);
   716 
   717 fun movedot_lambda _ [] = []
   718   | movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ((t, ki) :: ts) =
   719       if k <= ki then
   720         (B, j, tss @ t, sa, id, i) ::
   721           movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ts
   722       else movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ts;
   723 
   724 
   725 (*trigger value for warnings*)
   726 val branching_level = Config.int (Config.declare "syntax_branching_level" (fn _ => Config.Int 600));
   727 
   728 (*get all productions of a NT and NTs chained to it which can
   729   be started by specified token*)
   730 fun prods_for prods chains include_none tk nts =
   731   let
   732     fun token_assoc (list, key) =
   733       let
   734         fun assoc [] result = result
   735           | assoc ((keyi, pi) :: pairs) result =
   736               if is_some keyi andalso Lexicon.matching_tokens (the keyi, key)
   737                  orelse include_none andalso is_none keyi then
   738                 assoc pairs (pi @ result)
   739               else assoc pairs result;
   740       in assoc list [] end;
   741 
   742     fun get_prods [] result = result
   743       | get_prods (nt :: nts) result =
   744           let val nt_prods = snd (Vector.sub (prods, nt));
   745           in get_prods nts (token_assoc (nt_prods, tk) @ result) end;
   746   in get_prods (connected_with chains nts nts) [] end;
   747 
   748 
   749 fun PROCESSS ctxt warned prods chains Estate i c states =
   750   let
   751     fun all_prods_for nt = prods_for prods chains true c [nt];
   752 
   753     fun processS used [] (Si, Sii) = (Si, Sii)
   754       | processS used (S :: States) (Si, Sii) =
   755           (case S of
   756             (_, _, _, Nonterminal (nt, min_prec) :: _, _, _) =>
   757               let (*predictor operation*)
   758                 val (used', new_states) =
   759                   (case AList.lookup (op =) used nt of
   760                     SOME (used_prec, l) => (*nonterminal has been processed*)
   761                       if used_prec <= min_prec then
   762                         (*wanted precedence has been processed*)
   763                         (used, movedot_lambda S l)
   764                       else (*wanted precedence hasn't been parsed yet*)
   765                         let
   766                           val tk_prods = all_prods_for nt;
   767                           val States' =
   768                             mk_states i min_prec nt (get_RHS' min_prec used_prec tk_prods);
   769                         in (update_prec (nt, min_prec) used, movedot_lambda S l @ States') end
   770                   | NONE => (*nonterminal is parsed for the first time*)
   771                       let
   772                         val tk_prods = all_prods_for nt;
   773                         val States' = mk_states i min_prec nt (get_RHS min_prec tk_prods);
   774                       in ((nt, (min_prec, [])) :: used, States') end);
   775 
   776                 val _ =
   777                   if not (! warned) andalso
   778                      length new_states + length States > Config.get ctxt branching_level then
   779                     (Context_Position.if_visible ctxt warning
   780                       "Currently parsed expression could be extremely ambiguous";
   781                      warned := true)
   782                   else ();
   783               in
   784                 processS used' (new_states @ States) (S :: Si, Sii)
   785               end
   786           | (_, _, _, Terminal a :: _, _, _) => (*scanner operation*)
   787               processS used States
   788                 (S :: Si,
   789                   if Lexicon.matching_tokens (a, c) then movedot_term S c :: Sii else Sii)
   790           | (A, prec, ts, [], id, j) => (*completer operation*)
   791               let val tt = if id = "" then ts else [Node (id, ts)] in
   792                 if j = i then (*lambda production?*)
   793                   let
   794                     val (used', O) = update_trees used (A, (tt, prec));
   795                   in
   796                     (case O of
   797                       NONE =>
   798                         let
   799                           val Slist = getS A prec Si;
   800                           val States' = map (movedot_nonterm tt) Slist;
   801                         in processS used' (States' @ States) (S :: Si, Sii) end
   802                     | SOME n =>
   803                         if n >= prec then processS used' States (S :: Si, Sii)
   804                         else
   805                           let
   806                             val Slist = getS' A prec n Si;
   807                             val States' = map (movedot_nonterm tt) Slist;
   808                           in processS used' (States' @ States) (S :: Si, Sii) end)
   809                   end
   810                 else
   811                   let val Slist = get_states Estate i j A prec
   812                   in processS used (map (movedot_nonterm tt) Slist @ States) (S :: Si, Sii) end
   813               end)
   814   in processS [] states ([], []) end;
   815 
   816 
   817 fun produce ctxt warned prods tags chains stateset i indata prev_token =
   818   (case Array.sub (stateset, i) of
   819     [] =>
   820       let
   821         val toks = if Lexicon.is_eof prev_token then indata else prev_token :: indata;
   822         val pos = Position.str_of (Lexicon.pos_of_token prev_token);
   823       in
   824         if null toks then error ("Inner syntax error: unexpected end of input" ^ pos)
   825         else error (Pretty.string_of (Pretty.block
   826           (Pretty.str ("Inner syntax error" ^ pos ^ " at \"") ::
   827             Pretty.breaks (map (Pretty.str o Lexicon.str_of_token) (#1 (split_last toks))) @
   828             [Pretty.str "\""])))
   829       end
   830   | s =>
   831       (case indata of
   832         [] => s
   833       | c :: cs =>
   834           let
   835             val (si, sii) = PROCESSS ctxt warned prods chains stateset i c s;
   836             val _ = Array.update (stateset, i, si);
   837             val _ = Array.update (stateset, i + 1, sii);
   838           in produce ctxt warned prods tags chains stateset (i + 1) cs c end));
   839 
   840 
   841 fun get_trees states = map_filter (fn (_, _, [pt], _, _, _) => SOME pt | _ => NONE) states;
   842 
   843 fun earley ctxt prods tags chains startsymbol indata =
   844   let
   845     val start_tag =
   846       (case Symtab.lookup tags startsymbol of
   847         SOME tag => tag
   848       | NONE => error ("Inner syntax: bad grammar root symbol " ^ quote startsymbol));
   849     val S0 = [(~1, 0, [], [Nonterminal (start_tag, 0), Terminal Lexicon.eof], "", 0)];
   850     val s = length indata + 1;
   851     val Estate = Array.array (s, []);
   852     val _ = Array.update (Estate, 0, S0);
   853   in
   854     get_trees
   855       (produce ctxt (Unsynchronized.ref false) prods tags chains Estate 0 indata Lexicon.eof)
   856   end;
   857 
   858 
   859 fun parse ctxt (Gram {tags, prods, chains, ...}) start toks =
   860   let
   861     val end_pos =
   862       (case try List.last toks of
   863         NONE => Position.none
   864       | SOME (Lexicon.Token (_, _, (_, end_pos))) => end_pos);
   865     val r =
   866       (case earley ctxt prods tags chains start (toks @ [Lexicon.mk_eof end_pos]) of
   867         [] => raise Fail "Inner syntax: no parse trees"
   868       | pts => pts);
   869   in r end;
   870 
   871 
   872 fun guess_infix_lr (Gram gram) c = (*based on educated guess*)
   873   let
   874     fun freeze a = map_range (curry Vector.sub a) (Vector.length a);
   875     val prods = maps snd (maps snd (freeze (#prods gram)));
   876     fun guess (SOME ([Nonterminal (_, k),
   877             Terminal (Lexicon.Token (Lexicon.Literal, s, _)), Nonterminal (_, l)], _, j)) =
   878           if k = j andalso l = j + 1 then SOME (s, true, false, j)
   879           else if k = j + 1 then if l = j then SOME (s, false, true, j)
   880             else if l = j + 1 then SOME (s, false, false, j)
   881             else NONE
   882           else NONE
   883       | guess _ = NONE;
   884   in guess (find_first (fn (_, s, _) => s = c) prods) end;
   885 
   886 end;