wenzelm@18
|
1 |
(* Title: Pure/Syntax/parser.ML
|
wenzelm@46
|
2 |
ID: $Id$
|
clasohm@345
|
3 |
Author: Sonia Mahjoub, Markus Wenzel and Carsten Clasohm, TU Muenchen
|
wenzelm@18
|
4 |
|
wenzelm@552
|
5 |
Isabelle's main parser (used for terms and types).
|
wenzelm@18
|
6 |
*)
|
wenzelm@18
|
7 |
|
wenzelm@18
|
8 |
signature PARSER =
|
wenzelm@18
|
9 |
sig
|
wenzelm@237
|
10 |
structure Lexicon: LEXICON
|
wenzelm@237
|
11 |
structure SynExt: SYN_EXT
|
wenzelm@237
|
12 |
local open Lexicon SynExt SynExt.Ast in
|
wenzelm@18
|
13 |
type gram
|
wenzelm@18
|
14 |
val empty_gram: gram
|
clasohm@624
|
15 |
val extend_gram: gram -> string list -> xprod list -> gram
|
wenzelm@237
|
16 |
val merge_grams: gram -> gram -> gram
|
wenzelm@237
|
17 |
val pretty_gram: gram -> Pretty.T list
|
wenzelm@237
|
18 |
datatype parsetree =
|
wenzelm@237
|
19 |
Node of string * parsetree list |
|
wenzelm@237
|
20 |
Tip of token
|
clasohm@330
|
21 |
val parse: gram -> string -> token list -> parsetree list
|
wenzelm@18
|
22 |
end
|
wenzelm@18
|
23 |
end;
|
wenzelm@18
|
24 |
|
wenzelm@237
|
25 |
functor ParserFun(structure Symtab: SYMTAB and Lexicon: LEXICON
|
wenzelm@552
|
26 |
and SynExt: SYN_EXT): PARSER =
|
wenzelm@18
|
27 |
struct
|
wenzelm@18
|
28 |
|
wenzelm@237
|
29 |
structure Pretty = SynExt.Ast.Pretty;
|
wenzelm@237
|
30 |
structure Lexicon = Lexicon;
|
wenzelm@237
|
31 |
structure SynExt = SynExt;
|
wenzelm@237
|
32 |
open Lexicon SynExt;
|
wenzelm@18
|
33 |
|
wenzelm@18
|
34 |
|
wenzelm@18
|
35 |
(** datatype gram **)
|
wenzelm@18
|
36 |
|
wenzelm@237
|
37 |
datatype symb =
|
wenzelm@237
|
38 |
Terminal of token |
|
wenzelm@237
|
39 |
Nonterminal of string * int;
|
wenzelm@18
|
40 |
|
clasohm@330
|
41 |
datatype refsymb = Term of token | Nonterm of rhss_ref * int
|
clasohm@330
|
42 |
(*reference to production list instead of name*)
|
clasohm@330
|
43 |
and gram = Gram of (string * (symb list * string * int)) list *
|
clasohm@330
|
44 |
(string * rhss_ref) list
|
clasohm@330
|
45 |
withtype rhss_ref = (token option * (refsymb list * string * int) list) list ref
|
clasohm@330
|
46 |
(*lookahead table: token and productions*)
|
wenzelm@18
|
47 |
|
clasohm@330
|
48 |
(* convert productions to reference grammar with lookaheads and eliminate
|
clasohm@330
|
49 |
chain productions *)
|
clasohm@373
|
50 |
fun mk_gram prods =
|
clasohm@395
|
51 |
let (*get reference on list of all possible rhss for nonterminal lhs
|
clasohm@330
|
52 |
(if it doesn't exist a new one is created and added to the nonterminal
|
clasohm@330
|
53 |
list)*)
|
clasohm@330
|
54 |
fun get_rhss ref_prods lhs =
|
clasohm@330
|
55 |
case assoc (ref_prods, lhs) of
|
clasohm@330
|
56 |
None =>
|
clasohm@330
|
57 |
let val l = ref [(None, [])]
|
clasohm@330
|
58 |
in (l, (lhs, l) :: ref_prods) end
|
clasohm@330
|
59 |
| Some l => (l, ref_prods);
|
clasohm@330
|
60 |
|
clasohm@330
|
61 |
(*convert symb list to refsymb list*)
|
clasohm@330
|
62 |
fun mk_refsymbs ref_prods [] rs = (rs, ref_prods)
|
clasohm@330
|
63 |
| mk_refsymbs ref_prods (Terminal tk :: symbs) rs =
|
clasohm@330
|
64 |
mk_refsymbs ref_prods symbs (rs @ [Term tk])
|
clasohm@330
|
65 |
| mk_refsymbs ref_prods (Nonterminal (name, prec) :: symbs) rs =
|
clasohm@330
|
66 |
let val (rhss, ref_prods') = get_rhss ref_prods name
|
clasohm@330
|
67 |
in mk_refsymbs ref_prods' symbs (rs @ [Nonterm (rhss, prec)])
|
clasohm@330
|
68 |
end;
|
clasohm@330
|
69 |
|
clasohm@330
|
70 |
(*convert prod list to (string * rhss_ref) list
|
clasohm@330
|
71 |
without computing lookaheads*)
|
clasohm@330
|
72 |
fun mk_ref_gram [] ref_prods = ref_prods
|
clasohm@330
|
73 |
| mk_ref_gram ((lhs, (rhs, name, prec)) :: ps) ref_prods =
|
clasohm@330
|
74 |
let val (rhs', ref_prods') = get_rhss ref_prods lhs;
|
clasohm@330
|
75 |
val (dummy, rhss) = hd (!rhs');
|
clasohm@330
|
76 |
val (ref_symbs, ref_prods'') = mk_refsymbs ref_prods' rhs [];
|
clasohm@330
|
77 |
in rhs' := [(dummy, (ref_symbs, name, prec) :: rhss)];
|
clasohm@330
|
78 |
mk_ref_gram ps ref_prods''
|
clasohm@330
|
79 |
end;
|
clasohm@330
|
80 |
|
clasohm@330
|
81 |
(*eliminate chain productions*)
|
clasohm@330
|
82 |
fun elim_chain ref_gram =
|
clasohm@330
|
83 |
let (*make a list of pairs representing chain productions and delete
|
clasohm@330
|
84 |
these productions*)
|
clasohm@330
|
85 |
fun list_chain [] = []
|
clasohm@330
|
86 |
| list_chain ((_, rhss_ref) :: ps) =
|
clasohm@330
|
87 |
let fun lists [] new_rhss chains = (new_rhss, chains)
|
clasohm@330
|
88 |
| lists (([Nonterm (id2, ~1)], _, ~1) :: rs)
|
clasohm@330
|
89 |
new_rhss chains =
|
clasohm@330
|
90 |
lists rs new_rhss ((rhss_ref, id2) :: chains)
|
clasohm@330
|
91 |
| lists (rhs :: rs) new_rhss chains =
|
clasohm@330
|
92 |
lists rs (rhs :: new_rhss) chains;
|
clasohm@330
|
93 |
|
clasohm@330
|
94 |
val (dummy, rhss) = hd (!rhss_ref);
|
clasohm@330
|
95 |
|
clasohm@330
|
96 |
val (new_rhss, chains) = lists rhss [] [];
|
clasohm@330
|
97 |
in rhss_ref := [(dummy, new_rhss)];
|
clasohm@330
|
98 |
chains @ (list_chain ps)
|
clasohm@330
|
99 |
end;
|
clasohm@330
|
100 |
|
clasohm@345
|
101 |
(*convert a list of pairs to an association list
|
clasohm@345
|
102 |
by using the first element as the key*)
|
clasohm@345
|
103 |
fun mk_assoc pairs =
|
clasohm@330
|
104 |
let fun doit [] result = result
|
clasohm@330
|
105 |
| doit ((id1, id2) :: ps) result =
|
clasohm@330
|
106 |
doit ps
|
clasohm@330
|
107 |
(overwrite (result, (id1, id2 :: (assocs result id1))));
|
clasohm@345
|
108 |
in doit pairs [] end;
|
clasohm@330
|
109 |
|
clasohm@330
|
110 |
(*replace reference by list of rhss in chain pairs*)
|
clasohm@345
|
111 |
fun deref (id1, ids) =
|
clasohm@345
|
112 |
let fun deref1 [] = []
|
clasohm@345
|
113 |
| deref1 (id :: ids) =
|
clasohm@330
|
114 |
let val (_, rhss) = hd (!id);
|
clasohm@345
|
115 |
in rhss @ (deref1 ids) end;
|
clasohm@345
|
116 |
in (id1, deref1 ids) end;
|
clasohm@330
|
117 |
|
clasohm@330
|
118 |
val chain_pairs =
|
clasohm@345
|
119 |
map deref (transitive_closure (mk_assoc (list_chain ref_gram)));
|
clasohm@330
|
120 |
|
clasohm@330
|
121 |
(*add new rhss to productions*)
|
clasohm@345
|
122 |
fun elim (rhss_ref, rhss) =
|
clasohm@330
|
123 |
let val (dummy, old_rhss) = hd (!rhss_ref);
|
clasohm@330
|
124 |
in rhss_ref := [(dummy, old_rhss @ rhss)] end;
|
clasohm@345
|
125 |
in map elim chain_pairs;
|
clasohm@330
|
126 |
ref_gram
|
clasohm@330
|
127 |
end;
|
clasohm@330
|
128 |
|
clasohm@330
|
129 |
val ref_gram = elim_chain (mk_ref_gram prods []);
|
clasohm@330
|
130 |
|
clasohm@330
|
131 |
(*make a list of all lambda NTs
|
clasohm@330
|
132 |
(i.e. nonterminals that can produce lambda*)
|
clasohm@330
|
133 |
val lambdas =
|
clasohm@330
|
134 |
let fun lambda [] result = result
|
clasohm@330
|
135 |
| lambda ((_, rhss_ref) :: nts) result =
|
clasohm@330
|
136 |
if rhss_ref mem result then
|
clasohm@330
|
137 |
lambda nts result
|
clasohm@330
|
138 |
else
|
clasohm@345
|
139 |
let (*list all NTs that can be produced by a rhs
|
clasohm@345
|
140 |
containing only lambda NTs*)
|
clasohm@345
|
141 |
fun only_lambdas [] result = result
|
clasohm@345
|
142 |
| only_lambdas ((_, rhss_ref) :: ps) result =
|
clasohm@330
|
143 |
let fun only (symbs, _, _) =
|
clasohm@330
|
144 |
forall (fn (Nonterm (id, _)) => id mem result
|
clasohm@330
|
145 |
| (Term _) => false) symbs;
|
clasohm@330
|
146 |
|
clasohm@330
|
147 |
val (_, rhss) = hd (!rhss_ref);
|
clasohm@330
|
148 |
in if not (rhss_ref mem result) andalso
|
clasohm@330
|
149 |
exists only rhss then
|
clasohm@345
|
150 |
only_lambdas ref_gram (rhss_ref :: result)
|
clasohm@330
|
151 |
else
|
clasohm@345
|
152 |
only_lambdas ps result
|
clasohm@330
|
153 |
end;
|
clasohm@330
|
154 |
|
clasohm@330
|
155 |
val (_, rhss) = hd (!rhss_ref);
|
clasohm@330
|
156 |
in if exists (fn (symbs, _, _) => null symbs) rhss
|
clasohm@330
|
157 |
then lambda nts
|
clasohm@345
|
158 |
(only_lambdas ref_gram (rhss_ref :: result))
|
clasohm@330
|
159 |
else lambda nts result
|
clasohm@330
|
160 |
end;
|
clasohm@330
|
161 |
in lambda ref_gram [] end;
|
clasohm@330
|
162 |
|
clasohm@330
|
163 |
(*list all nonterminals on which the lookahead depends (due to lambda
|
clasohm@330
|
164 |
NTs this can be more than one)
|
clasohm@330
|
165 |
and report if there is a terminal at the 'start'*)
|
clasohm@330
|
166 |
fun rhss_start [] skipped = (None, skipped)
|
clasohm@330
|
167 |
| rhss_start (Term tk :: _) skipped = (Some tk, skipped)
|
clasohm@330
|
168 |
| rhss_start (Nonterm (rhss_ref, _) :: rest) skipped =
|
clasohm@330
|
169 |
if rhss_ref mem lambdas then
|
clasohm@330
|
170 |
rhss_start rest (rhss_ref ins skipped)
|
clasohm@330
|
171 |
else
|
clasohm@330
|
172 |
(None, rhss_ref ins skipped);
|
clasohm@330
|
173 |
|
clasohm@330
|
174 |
(*list all terminals that can start the given rhss*)
|
clasohm@330
|
175 |
fun look_rhss starts rhss_ref =
|
clasohm@377
|
176 |
let fun look [] _ result = result
|
clasohm@377
|
177 |
| look ((symbs, _, _) :: todos) done result =
|
clasohm@330
|
178 |
let val (start_token, skipped) = rhss_start symbs [];
|
clasohm@330
|
179 |
|
clasohm@330
|
180 |
(*process all nonterminals on which the lookahead
|
clasohm@330
|
181 |
depends and build the new todo and done lists for
|
clasohm@330
|
182 |
the look function*)
|
clasohm@377
|
183 |
fun look2 [] todos result =
|
clasohm@377
|
184 |
look todos (done union skipped) result
|
clasohm@377
|
185 |
| look2 (rhss_ref :: ls) todos result =
|
clasohm@377
|
186 |
if rhss_ref mem done then look2 ls todos result
|
clasohm@330
|
187 |
else case assoc (starts, rhss_ref) of
|
clasohm@377
|
188 |
Some tks => look2 ls todos (tks union result)
|
clasohm@377
|
189 |
| None =>
|
clasohm@377
|
190 |
let val (_, rhss) = hd (!rhss_ref);
|
clasohm@377
|
191 |
in look2 ls (rhss @ todos) result end;
|
clasohm@330
|
192 |
in case start_token of
|
clasohm@377
|
193 |
Some tk => look2 skipped todos (start_token ins result)
|
clasohm@377
|
194 |
| None => look2 skipped todos result
|
clasohm@330
|
195 |
end;
|
clasohm@330
|
196 |
|
clasohm@330
|
197 |
val (_, rhss) = hd (!rhss_ref);
|
clasohm@377
|
198 |
in look rhss [rhss_ref] [] end;
|
clasohm@330
|
199 |
|
clasohm@330
|
200 |
(*make a table that contains all possible starting terminals
|
clasohm@330
|
201 |
for each nonterminal*)
|
clasohm@330
|
202 |
fun mk_starts [] starts = starts
|
clasohm@330
|
203 |
| mk_starts ((_, rhss_ref) :: ps) starts =
|
clasohm@330
|
204 |
mk_starts ps ((rhss_ref, look_rhss starts rhss_ref) :: starts);
|
clasohm@330
|
205 |
|
clasohm@330
|
206 |
val starts = mk_starts ref_gram [];
|
clasohm@330
|
207 |
|
clasohm@330
|
208 |
(*add list of allowed starting tokens to productions*)
|
clasohm@330
|
209 |
fun mk_lookahead (_, rhss_ref) =
|
clasohm@377
|
210 |
let (*compares two values of type token option
|
clasohm@377
|
211 |
(used for speed reasons)*)
|
clasohm@377
|
212 |
fun matching_opt_tks (Some tk1, Some tk2) =
|
clasohm@377
|
213 |
matching_tokens (tk1, tk2)
|
clasohm@377
|
214 |
| matching_opt_tks _ = false;
|
clasohm@377
|
215 |
|
clasohm@377
|
216 |
(*add item to lookahead list (a list containing pairs of token and
|
clasohm@377
|
217 |
rhss that can be started with it)*)
|
clasohm@345
|
218 |
fun add_start new_rhs tokens table =
|
clasohm@345
|
219 |
let fun add [] [] = []
|
clasohm@345
|
220 |
| add (tk :: tks) [] =
|
clasohm@345
|
221 |
(tk, [new_rhs]) :: (add tks [])
|
clasohm@345
|
222 |
| add tokens ((tk, rhss) :: ss) =
|
clasohm@377
|
223 |
if gen_mem matching_opt_tks (tk, tokens) then
|
clasohm@345
|
224 |
(tk, new_rhs :: rhss) :: (add (tokens \ tk) ss)
|
clasohm@330
|
225 |
else
|
clasohm@345
|
226 |
(tk, rhss) :: (add tokens ss);
|
clasohm@345
|
227 |
in add tokens table end;
|
clasohm@330
|
228 |
|
clasohm@330
|
229 |
(*combine all lookaheads of a list of nonterminals*)
|
clasohm@345
|
230 |
fun combine_starts rhss_refs =
|
clasohm@377
|
231 |
foldr (gen_union matching_opt_tks)
|
clasohm@345
|
232 |
((map (fn rhss_ref => let val Some tks = assoc (starts, rhss_ref)
|
clasohm@345
|
233 |
in tks end) rhss_refs), []);
|
clasohm@330
|
234 |
|
clasohm@330
|
235 |
(*get lookahead for a rhs and update lhs' lookahead list*)
|
clasohm@330
|
236 |
fun look_rhss [] table = table
|
clasohm@330
|
237 |
| look_rhss ((rhs as (symbs, id, prec)) :: rs) table =
|
clasohm@330
|
238 |
let val (start_token, skipped) = rhss_start symbs [];
|
clasohm@330
|
239 |
val starts = case start_token of
|
clasohm@377
|
240 |
Some tk => gen_ins matching_opt_tks
|
clasohm@377
|
241 |
(Some tk, combine_starts skipped)
|
clasohm@345
|
242 |
| None => if skipped subset lambdas then
|
clasohm@330
|
243 |
[None]
|
clasohm@330
|
244 |
else
|
clasohm@330
|
245 |
combine_starts skipped;
|
clasohm@330
|
246 |
in look_rhss rs (add_start rhs starts table) end;
|
clasohm@330
|
247 |
|
clasohm@330
|
248 |
val (_, rhss) = hd (!rhss_ref);
|
clasohm@330
|
249 |
in rhss_ref := look_rhss rhss [] end;
|
clasohm@330
|
250 |
|
clasohm@330
|
251 |
in map mk_lookahead ref_gram;
|
clasohm@330
|
252 |
Gram (prods, ref_gram)
|
clasohm@330
|
253 |
end;
|
wenzelm@18
|
254 |
|
wenzelm@18
|
255 |
|
wenzelm@237
|
256 |
(* empty, extend, merge grams *)
|
wenzelm@18
|
257 |
|
wenzelm@237
|
258 |
val empty_gram = mk_gram [];
|
wenzelm@18
|
259 |
|
clasohm@624
|
260 |
fun extend_gram (gram1 as Gram (prods1, _)) roots xprods2 =
|
wenzelm@18
|
261 |
let
|
clasohm@624
|
262 |
fun simplify preserve s =
|
clasohm@624
|
263 |
if preserve then
|
clasohm@624
|
264 |
(if s mem (roots \\ ["type", "prop", "any"]) then "logic" else s)
|
clasohm@624
|
265 |
else (if s = "prop" then "@prop_H" else
|
clasohm@624
|
266 |
(if s mem (roots \\ ["type", "prop", "any"]) then
|
clasohm@624
|
267 |
"@logic_H"
|
clasohm@624
|
268 |
else s));
|
clasohm@624
|
269 |
|
clasohm@624
|
270 |
fun not_delim (Delim _) = false
|
clasohm@624
|
271 |
| not_delim _ = true
|
clasohm@624
|
272 |
|
clasohm@624
|
273 |
fun symb_of _ (Delim s) = Some (Terminal (Token s))
|
clasohm@624
|
274 |
| symb_of preserve (Argument (s, p)) =
|
wenzelm@18
|
275 |
(case predef_term s of
|
clasohm@624
|
276 |
None => Some (Nonterminal (simplify preserve s, p))
|
wenzelm@237
|
277 |
| Some tk => Some (Terminal tk))
|
clasohm@624
|
278 |
| symb_of _ _ = None;
|
wenzelm@18
|
279 |
|
wenzelm@237
|
280 |
fun prod_of (XProd (lhs, xsymbs, const, pri)) =
|
clasohm@624
|
281 |
let val copy_prod = (lhs = "prop" andalso forall not_delim xsymbs andalso
|
clasohm@624
|
282 |
const <> "");
|
clasohm@624
|
283 |
|
clasohm@624
|
284 |
val preserve = copy_prod
|
clasohm@624
|
285 |
orelse pri = chain_pri andalso lhs = "logic"
|
clasohm@624
|
286 |
orelse lhs mem ["@prop_H", "@logic_H", "any"];
|
clasohm@624
|
287 |
|
clasohm@624
|
288 |
val lhs' = if copy_prod then "@prop_H" else
|
clasohm@624
|
289 |
if lhs = "logic" andalso pri = chain_pri
|
clasohm@624
|
290 |
then "@logic_H"
|
clasohm@624
|
291 |
else if lhs mem ("logic1" :: (roots \\ ["type", "prop"]))
|
clasohm@624
|
292 |
then "logic"
|
clasohm@624
|
293 |
else lhs;
|
clasohm@624
|
294 |
in (lhs', (mapfilter (symb_of preserve) xsymbs, const, pri))
|
clasohm@624
|
295 |
end;
|
wenzelm@18
|
296 |
|
wenzelm@237
|
297 |
val prods2 = distinct (map prod_of xprods2);
|
wenzelm@18
|
298 |
in
|
wenzelm@237
|
299 |
if prods2 subset prods1 then gram1
|
clasohm@395
|
300 |
else (writeln "Building new grammar...";
|
clasohm@395
|
301 |
mk_gram (extend_list prods1 prods2))
|
wenzelm@18
|
302 |
end;
|
wenzelm@18
|
303 |
|
wenzelm@237
|
304 |
fun merge_grams (gram1 as Gram (prods1, _)) (gram2 as Gram (prods2, _)) =
|
wenzelm@237
|
305 |
if prods2 subset prods1 then gram1
|
wenzelm@237
|
306 |
else if prods1 subset prods2 then gram2
|
clasohm@395
|
307 |
else (writeln "Building new grammar...";
|
clasohm@395
|
308 |
mk_gram (merge_lists prods1 prods2));
|
wenzelm@18
|
309 |
|
wenzelm@18
|
310 |
|
wenzelm@237
|
311 |
(* pretty_gram *)
|
wenzelm@18
|
312 |
|
wenzelm@237
|
313 |
fun pretty_gram (Gram (prods, _)) =
|
wenzelm@237
|
314 |
let
|
wenzelm@237
|
315 |
fun pretty_name name = [Pretty.str (name ^ " =")];
|
wenzelm@18
|
316 |
|
wenzelm@237
|
317 |
fun pretty_symb (Terminal (Token s)) = Pretty.str (quote s)
|
wenzelm@237
|
318 |
| pretty_symb (Terminal tok) = Pretty.str (str_of_token tok)
|
wenzelm@237
|
319 |
| pretty_symb (Nonterminal (s, p)) =
|
wenzelm@237
|
320 |
Pretty.str (s ^ "[" ^ string_of_int p ^ "]");
|
wenzelm@18
|
321 |
|
wenzelm@237
|
322 |
fun pretty_const "" = []
|
wenzelm@237
|
323 |
| pretty_const c = [Pretty.str ("=> " ^ quote c)];
|
wenzelm@237
|
324 |
|
wenzelm@237
|
325 |
fun pretty_pri p = [Pretty.str ("(" ^ string_of_int p ^ ")")];
|
wenzelm@237
|
326 |
|
wenzelm@237
|
327 |
fun pretty_prod (name, (symbs, const, pri)) =
|
wenzelm@237
|
328 |
Pretty.block (Pretty.breaks (pretty_name name @
|
wenzelm@237
|
329 |
map pretty_symb symbs @ pretty_const const @ pretty_pri pri));
|
wenzelm@18
|
330 |
in
|
wenzelm@237
|
331 |
map pretty_prod prods
|
wenzelm@18
|
332 |
end;
|
wenzelm@18
|
333 |
|
wenzelm@18
|
334 |
|
wenzelm@18
|
335 |
|
wenzelm@18
|
336 |
(** parse **)
|
wenzelm@18
|
337 |
|
wenzelm@237
|
338 |
datatype parsetree =
|
wenzelm@237
|
339 |
Node of string * parsetree list |
|
wenzelm@237
|
340 |
Tip of token;
|
wenzelm@237
|
341 |
|
wenzelm@18
|
342 |
type state =
|
clasohm@330
|
343 |
rhss_ref * int (*lhs: identification and production precedence*)
|
clasohm@330
|
344 |
* parsetree list (*already parsed nonterminals on rhs*)
|
clasohm@330
|
345 |
* refsymb list (*rest of rhs*)
|
clasohm@330
|
346 |
* string (*name of production*)
|
clasohm@330
|
347 |
* int; (*index for previous state list*)
|
wenzelm@18
|
348 |
|
wenzelm@18
|
349 |
type earleystate = state list Array.array;
|
wenzelm@18
|
350 |
|
wenzelm@18
|
351 |
|
clasohm@330
|
352 |
(*Get all rhss with precedence >= minPrec*)
|
clasohm@330
|
353 |
fun getRHS minPrec = filter (fn (_, _, prec:int) => prec >= minPrec);
|
wenzelm@18
|
354 |
|
clasohm@330
|
355 |
(*Get all rhss with precedence >= minPrec and < maxPrec*)
|
clasohm@330
|
356 |
fun getRHS' minPrec maxPrec =
|
clasohm@330
|
357 |
filter (fn (_, _, prec:int) => prec >= minPrec andalso prec < maxPrec);
|
wenzelm@237
|
358 |
|
clasohm@330
|
359 |
(*Make states using a list of rhss*)
|
clasohm@330
|
360 |
fun mkStates i minPrec lhsID rhss =
|
clasohm@330
|
361 |
let fun mkState (rhs, id, prodPrec) = (lhsID, prodPrec, [], rhs, id, i);
|
clasohm@330
|
362 |
in map mkState rhss end;
|
clasohm@330
|
363 |
|
clasohm@330
|
364 |
(*Add parse tree to list and eliminate duplicates
|
clasohm@330
|
365 |
saving the maximum precedence*)
|
clasohm@330
|
366 |
fun conc (t, prec:int) [] = (None, [(t, prec)])
|
clasohm@330
|
367 |
| conc (t, prec) ((t', prec') :: ts) =
|
clasohm@330
|
368 |
if t = t' then
|
clasohm@330
|
369 |
(Some prec', if prec' >= prec then (t', prec') :: ts
|
clasohm@330
|
370 |
else (t, prec) :: ts)
|
clasohm@330
|
371 |
else
|
clasohm@330
|
372 |
let val (n, ts') = conc (t, prec) ts
|
clasohm@330
|
373 |
in (n, (t', prec') :: ts') end;
|
wenzelm@18
|
374 |
|
clasohm@330
|
375 |
(*Update entry in used*)
|
wenzelm@237
|
376 |
fun update_tree ((B, (i, ts)) :: used) (A, t) =
|
wenzelm@237
|
377 |
if A = B then
|
wenzelm@237
|
378 |
let val (n, ts') = conc t ts
|
wenzelm@237
|
379 |
in ((A, (i, ts')) :: used, n) end
|
wenzelm@237
|
380 |
else
|
wenzelm@237
|
381 |
let val (used', n) = update_tree used (A, t)
|
wenzelm@237
|
382 |
in ((B, (i, ts)) :: used', n) end;
|
wenzelm@18
|
383 |
|
clasohm@330
|
384 |
(*Replace entry in used*)
|
clasohm@330
|
385 |
fun update_index (A, prec) used =
|
clasohm@330
|
386 |
let fun update((hd as (B, (_, ts))) :: used, used') =
|
clasohm@330
|
387 |
if A = B
|
clasohm@330
|
388 |
then used' @ ((A, (prec, ts)) :: used)
|
clasohm@330
|
389 |
else update (used, hd :: used')
|
clasohm@330
|
390 |
in update (used, []) end;
|
wenzelm@18
|
391 |
|
clasohm@330
|
392 |
fun getS A maxPrec Si =
|
wenzelm@237
|
393 |
filter
|
clasohm@330
|
394 |
(fn (_, _, _, Nonterm (B, prec) :: _, _, _)
|
clasohm@330
|
395 |
=> A = B andalso prec <= maxPrec
|
wenzelm@237
|
396 |
| _ => false) Si;
|
wenzelm@18
|
397 |
|
clasohm@330
|
398 |
fun getS' A maxPrec minPrec Si =
|
wenzelm@237
|
399 |
filter
|
clasohm@330
|
400 |
(fn (_, _, _, Nonterm (B, prec) :: _, _, _)
|
clasohm@330
|
401 |
=> A = B andalso prec > minPrec andalso prec <= maxPrec
|
wenzelm@237
|
402 |
| _ => false) Si;
|
wenzelm@18
|
403 |
|
clasohm@330
|
404 |
fun getStates Estate i ii A maxPrec =
|
wenzelm@237
|
405 |
filter
|
clasohm@330
|
406 |
(fn (_, _, _, Nonterm (B, prec) :: _, _, _)
|
clasohm@330
|
407 |
=> A = B andalso prec <= maxPrec
|
wenzelm@237
|
408 |
| _ => false)
|
wenzelm@237
|
409 |
(Array.sub (Estate, ii));
|
wenzelm@18
|
410 |
|
wenzelm@18
|
411 |
|
clasohm@330
|
412 |
fun movedot_term (A, j, ts, Term a :: sa, id, i) c =
|
wenzelm@237
|
413 |
if valued_token c then
|
wenzelm@237
|
414 |
(A, j, (ts @ [Tip c]), sa, id, i)
|
wenzelm@237
|
415 |
else (A, j, ts, sa, id, i);
|
wenzelm@18
|
416 |
|
clasohm@330
|
417 |
fun movedot_nonterm ts (A, j, tss, Nonterm _ :: sa, id, i) =
|
wenzelm@237
|
418 |
(A, j, tss @ ts, sa, id, i);
|
wenzelm@18
|
419 |
|
wenzelm@237
|
420 |
fun movedot_lambda _ [] = []
|
clasohm@330
|
421 |
| movedot_lambda (B, j, tss, Nonterm (A, k) :: sa, id, i) ((t, ki) :: ts) =
|
wenzelm@237
|
422 |
if k <= ki then
|
wenzelm@237
|
423 |
(B, j, tss @ t, sa, id, i) ::
|
clasohm@330
|
424 |
movedot_lambda (B, j, tss, Nonterm (A, k) :: sa, id, i) ts
|
clasohm@330
|
425 |
else movedot_lambda (B, j, tss, Nonterm (A, k) :: sa, id, i) ts;
|
wenzelm@18
|
426 |
|
wenzelm@18
|
427 |
|
wenzelm@18
|
428 |
|
clasohm@330
|
429 |
fun PROCESSS Estate i c states =
|
wenzelm@18
|
430 |
let
|
clasohm@330
|
431 |
fun get_lookahead rhss_ref = token_assoc (!rhss_ref, c);
|
clasohm@330
|
432 |
|
wenzelm@237
|
433 |
fun processS used [] (Si, Sii) = (Si, Sii)
|
wenzelm@237
|
434 |
| processS used (S :: States) (Si, Sii) =
|
wenzelm@237
|
435 |
(case S of
|
clasohm@330
|
436 |
(_, _, _, Nonterm (rhss_ref, minPrec) :: _, _, _) =>
|
clasohm@330
|
437 |
let (*predictor operation*)
|
clasohm@330
|
438 |
val (used_new, States_new) =
|
clasohm@330
|
439 |
(case assoc (used, rhss_ref) of
|
clasohm@330
|
440 |
Some (usedPrec, l) => (*nonterminal has been processed*)
|
clasohm@330
|
441 |
if usedPrec <= minPrec then
|
clasohm@330
|
442 |
(*wanted precedence has been processed*)
|
clasohm@330
|
443 |
(used, movedot_lambda S l)
|
clasohm@330
|
444 |
else (*wanted precedence hasn't been parsed yet*)
|
clasohm@330
|
445 |
let val rhss = get_lookahead rhss_ref;
|
clasohm@330
|
446 |
val States' = mkStates i minPrec rhss_ref
|
clasohm@330
|
447 |
(getRHS' minPrec usedPrec rhss);
|
clasohm@330
|
448 |
in (update_index (rhss_ref, minPrec) used,
|
clasohm@330
|
449 |
movedot_lambda S l @ States')
|
wenzelm@237
|
450 |
end
|
wenzelm@18
|
451 |
|
clasohm@330
|
452 |
| None => (*nonterminal is parsed for the first time*)
|
clasohm@330
|
453 |
let val rhss = get_lookahead rhss_ref;
|
clasohm@330
|
454 |
val States' = mkStates i minPrec rhss_ref
|
clasohm@330
|
455 |
(getRHS minPrec rhss);
|
clasohm@330
|
456 |
in ((rhss_ref, (minPrec, [])) :: used, States') end)
|
wenzelm@237
|
457 |
in
|
clasohm@330
|
458 |
processS used_new (States_new @ States) (S :: Si, Sii)
|
wenzelm@237
|
459 |
end
|
clasohm@330
|
460 |
| (_, _, _, Term a :: _, _, _) => (*scanner operation*)
|
wenzelm@237
|
461 |
processS used States
|
wenzelm@237
|
462 |
(S :: Si,
|
wenzelm@237
|
463 |
if matching_tokens (a, c) then movedot_term S c :: Sii else Sii)
|
wenzelm@18
|
464 |
|
clasohm@330
|
465 |
| (A, prec, ts, [], id, j) => (*completer operation*)
|
wenzelm@237
|
466 |
let
|
wenzelm@237
|
467 |
val tt = if id = "" then ts else [Node (id, ts)]
|
wenzelm@237
|
468 |
in
|
clasohm@330
|
469 |
if j = i then (*lambda production?*)
|
wenzelm@237
|
470 |
let
|
clasohm@330
|
471 |
val (used', O) = update_tree used (A, (tt, prec));
|
wenzelm@237
|
472 |
in
|
wenzelm@237
|
473 |
(case O of
|
wenzelm@237
|
474 |
None =>
|
wenzelm@237
|
475 |
let
|
clasohm@330
|
476 |
val Slist = getS A prec Si;
|
wenzelm@237
|
477 |
val States' = map (movedot_nonterm tt) Slist;
|
wenzelm@237
|
478 |
in
|
clasohm@330
|
479 |
processS used' (States' @ States) (S :: Si, Sii)
|
wenzelm@237
|
480 |
end
|
wenzelm@237
|
481 |
| Some n =>
|
clasohm@330
|
482 |
if n >= prec then
|
wenzelm@237
|
483 |
processS used' States (S :: Si, Sii)
|
wenzelm@237
|
484 |
else
|
wenzelm@237
|
485 |
let
|
clasohm@330
|
486 |
val Slist = getS' A prec n Si;
|
wenzelm@237
|
487 |
val States' = map (movedot_nonterm tt) Slist;
|
wenzelm@18
|
488 |
in
|
clasohm@330
|
489 |
processS used' (States' @ States) (S :: Si, Sii)
|
wenzelm@237
|
490 |
end)
|
clasohm@330
|
491 |
end
|
wenzelm@237
|
492 |
else
|
wenzelm@237
|
493 |
processS used
|
clasohm@330
|
494 |
(map (movedot_nonterm tt) (getStates Estate i j A prec) @ States)
|
wenzelm@237
|
495 |
(S :: Si, Sii)
|
wenzelm@237
|
496 |
end)
|
wenzelm@18
|
497 |
in
|
wenzelm@237
|
498 |
processS [] states ([], [])
|
wenzelm@18
|
499 |
end;
|
wenzelm@18
|
500 |
|
wenzelm@18
|
501 |
|
wenzelm@18
|
502 |
|
clasohm@362
|
503 |
fun syntax_error toks allowed =
|
clasohm@362
|
504 |
error
|
clasohm@362
|
505 |
((if toks = [] then
|
clasohm@362
|
506 |
"error: unexpected end of input\n"
|
clasohm@362
|
507 |
else
|
clasohm@367
|
508 |
"Syntax error at: " ^ quote (space_implode " " (map str_of_token
|
clasohm@367
|
509 |
((rev o tl o rev) toks)))
|
clasohm@362
|
510 |
^ "\n")
|
clasohm@362
|
511 |
^ "Expected tokens: "
|
clasohm@362
|
512 |
^ space_implode ", " (map (quote o str_of_token) allowed));
|
wenzelm@18
|
513 |
|
clasohm@362
|
514 |
fun produce stateset i indata prev_token =
|
clasohm@362
|
515 |
(*the argument prev_token is only used for error messages*)
|
wenzelm@237
|
516 |
(case Array.sub (stateset, i) of
|
clasohm@373
|
517 |
[] => let (*similar to token_assoc but does not automatically
|
clasohm@373
|
518 |
include 'None' key*)
|
clasohm@373
|
519 |
fun token_assoc2 (list, key) =
|
clasohm@373
|
520 |
let fun assoc [] = []
|
clasohm@373
|
521 |
| assoc ((keyi, xi) :: pairs) =
|
clasohm@373
|
522 |
if is_some keyi andalso
|
clasohm@373
|
523 |
matching_tokens (the keyi, key) then
|
clasohm@373
|
524 |
(assoc pairs) @ xi
|
clasohm@373
|
525 |
else assoc pairs;
|
clasohm@373
|
526 |
in assoc list end;
|
clasohm@373
|
527 |
|
clasohm@373
|
528 |
(*test if tk is a lookahead for a given minimum precedence*)
|
clasohm@373
|
529 |
fun reduction minPrec tk _ (Term _ :: _, _, prec:int) =
|
clasohm@373
|
530 |
if prec >= minPrec then true
|
clasohm@373
|
531 |
else false
|
clasohm@373
|
532 |
| reduction minPrec tk checked
|
clasohm@373
|
533 |
(Nonterm (rhss_ref, NTprec)::_,_, prec) =
|
clasohm@373
|
534 |
if prec >= minPrec andalso not (rhss_ref mem checked) then
|
clasohm@373
|
535 |
exists (reduction NTprec tk (rhss_ref :: checked))
|
clasohm@373
|
536 |
(token_assoc2 (!rhss_ref, tk))
|
clasohm@373
|
537 |
else false;
|
clasohm@373
|
538 |
|
clasohm@373
|
539 |
(*compute a list of allowed starting tokens
|
clasohm@373
|
540 |
for a list of nonterminals considering precedence*)
|
clasohm@362
|
541 |
fun get_starts [] = []
|
clasohm@372
|
542 |
| get_starts ((rhss_ref, minPrec:int) :: refs) =
|
clasohm@362
|
543 |
let fun get [] = []
|
clasohm@373
|
544 |
| get ((Some tk, prods) :: rhss) =
|
clasohm@373
|
545 |
if exists (reduction minPrec tk [rhss_ref]) prods
|
clasohm@373
|
546 |
then tk :: (get rhss)
|
clasohm@372
|
547 |
else get rhss
|
clasohm@362
|
548 |
| get ((None, _) :: rhss) =
|
clasohm@362
|
549 |
get rhss;
|
clasohm@362
|
550 |
in (get (!rhss_ref)) union (get_starts refs) end;
|
clasohm@362
|
551 |
|
clasohm@372
|
552 |
val NTs = map (fn (_, _, _, Nonterm (a, prec) :: _, _, _) =>
|
clasohm@372
|
553 |
(a, prec))
|
clasohm@362
|
554 |
(filter (fn (_, _, _, Nonterm _ :: _, _, _) => true
|
clasohm@362
|
555 |
| _ => false) (Array.sub (stateset, i-1)));
|
clasohm@367
|
556 |
val allowed = distinct (get_starts NTs @
|
clasohm@362
|
557 |
(map (fn (_, _, _, Term a :: _, _, _) => a)
|
clasohm@362
|
558 |
(filter (fn (_, _, _, Term _ :: _, _, _) => true
|
clasohm@367
|
559 |
| _ => false) (Array.sub (stateset, i-1)))));
|
clasohm@362
|
560 |
in syntax_error (if prev_token = EndToken then indata
|
clasohm@362
|
561 |
else prev_token :: indata) allowed
|
clasohm@362
|
562 |
end
|
wenzelm@237
|
563 |
| s =>
|
wenzelm@237
|
564 |
(case indata of
|
wenzelm@237
|
565 |
[] => Array.sub (stateset, i)
|
wenzelm@237
|
566 |
| c :: cs =>
|
wenzelm@237
|
567 |
let
|
clasohm@330
|
568 |
val (si, sii) = PROCESSS stateset i c s;
|
wenzelm@237
|
569 |
in
|
wenzelm@237
|
570 |
Array.update (stateset, i, si);
|
wenzelm@237
|
571 |
Array.update (stateset, i + 1, sii);
|
clasohm@362
|
572 |
produce stateset (i + 1) cs c
|
wenzelm@237
|
573 |
end));
|
wenzelm@18
|
574 |
|
wenzelm@18
|
575 |
|
wenzelm@237
|
576 |
val get_trees = mapfilter (fn (_, _, [pt], _, _, _) => Some pt | _ => None);
|
wenzelm@237
|
577 |
|
wenzelm@18
|
578 |
|
wenzelm@18
|
579 |
fun earley grammar startsymbol indata =
|
wenzelm@237
|
580 |
let
|
clasohm@624
|
581 |
val startsymbol' = case startsymbol of
|
clasohm@624
|
582 |
"logic" => "@logic_H"
|
clasohm@624
|
583 |
| "prop" => "@prop_H"
|
clasohm@624
|
584 |
| other => other;
|
clasohm@330
|
585 |
val rhss_ref = case assoc (grammar, startsymbol) of
|
clasohm@330
|
586 |
Some r => r
|
clasohm@624
|
587 |
| None => error ("parse: Unknown startsymbol " ^
|
clasohm@624
|
588 |
quote startsymbol);
|
clasohm@330
|
589 |
val S0 = [(ref [], 0, [], [Nonterm (rhss_ref, 0), Term EndToken], "", 0)];
|
clasohm@330
|
590 |
val s = length indata + 1;
|
wenzelm@237
|
591 |
val Estate = Array.array (s, []);
|
wenzelm@237
|
592 |
in
|
wenzelm@237
|
593 |
Array.update (Estate, 0, S0);
|
wenzelm@237
|
594 |
let
|
clasohm@362
|
595 |
val l = produce Estate 0 indata EndToken(*dummy*);
|
clasohm@624
|
596 |
|
wenzelm@237
|
597 |
val p_trees = get_trees l;
|
wenzelm@237
|
598 |
in p_trees end
|
wenzelm@237
|
599 |
end;
|
wenzelm@18
|
600 |
|
wenzelm@18
|
601 |
|
wenzelm@237
|
602 |
fun parse (Gram (_, prod_tab)) start toks =
|
clasohm@624
|
603 |
let val r =
|
wenzelm@237
|
604 |
(case earley prod_tab start toks of
|
wenzelm@237
|
605 |
[] => sys_error "parse: no parse trees"
|
clasohm@330
|
606 |
| pts => pts);
|
clasohm@624
|
607 |
in r end
|
wenzelm@18
|
608 |
|
wenzelm@18
|
609 |
end;
|