wenzelm@18
|
1 |
(* Title: Pure/Syntax/parser.ML
|
wenzelm@46
|
2 |
ID: $Id$
|
wenzelm@18
|
3 |
Author: Sonia Mahjoub and Markus Wenzel, TU Muenchen
|
wenzelm@18
|
4 |
|
wenzelm@18
|
5 |
Isabelle's main parser (used for terms and typs).
|
wenzelm@18
|
6 |
|
wenzelm@18
|
7 |
TODO:
|
wenzelm@237
|
8 |
fix ambiguous grammar problem
|
wenzelm@18
|
9 |
~1 --> chain_pri
|
wenzelm@18
|
10 |
improve syntax error
|
wenzelm@237
|
11 |
extend_gram: remove 'roots' arg
|
wenzelm@18
|
12 |
*)
|
wenzelm@18
|
13 |
|
wenzelm@18
|
14 |
signature PARSER =
|
wenzelm@18
|
15 |
sig
|
wenzelm@237
|
16 |
structure Lexicon: LEXICON
|
wenzelm@237
|
17 |
structure SynExt: SYN_EXT
|
wenzelm@237
|
18 |
local open Lexicon SynExt SynExt.Ast in
|
wenzelm@18
|
19 |
type gram
|
wenzelm@18
|
20 |
val empty_gram: gram
|
wenzelm@237
|
21 |
val extend_gram: gram -> string list -> xprod list -> gram
|
wenzelm@237
|
22 |
val merge_grams: gram -> gram -> gram
|
wenzelm@237
|
23 |
val pretty_gram: gram -> Pretty.T list
|
wenzelm@237
|
24 |
datatype parsetree =
|
wenzelm@237
|
25 |
Node of string * parsetree list |
|
wenzelm@237
|
26 |
Tip of token
|
wenzelm@18
|
27 |
val parse: gram -> string -> token list -> parsetree
|
wenzelm@18
|
28 |
end
|
wenzelm@18
|
29 |
end;
|
wenzelm@18
|
30 |
|
wenzelm@237
|
31 |
functor ParserFun(structure Symtab: SYMTAB and Lexicon: LEXICON
|
wenzelm@237
|
32 |
and SynExt: SYN_EXT)(*: PARSER *) = (* FIXME *)
|
wenzelm@18
|
33 |
struct
|
wenzelm@18
|
34 |
|
wenzelm@237
|
35 |
structure Pretty = SynExt.Ast.Pretty;
|
wenzelm@237
|
36 |
structure Lexicon = Lexicon;
|
wenzelm@237
|
37 |
structure SynExt = SynExt;
|
wenzelm@237
|
38 |
open Lexicon SynExt;
|
wenzelm@18
|
39 |
|
wenzelm@18
|
40 |
|
wenzelm@18
|
41 |
(** datatype gram **)
|
wenzelm@18
|
42 |
|
wenzelm@237
|
43 |
datatype symb =
|
wenzelm@237
|
44 |
Terminal of token |
|
wenzelm@237
|
45 |
Nonterminal of string * int;
|
wenzelm@18
|
46 |
|
wenzelm@18
|
47 |
datatype gram =
|
wenzelm@237
|
48 |
Gram of (string * (symb list * string * int)) list
|
wenzelm@237
|
49 |
* (symb list * string * int) list Symtab.table;
|
wenzelm@18
|
50 |
|
wenzelm@237
|
51 |
fun mk_gram prods = Gram (prods, Symtab.make_multi prods);
|
wenzelm@18
|
52 |
|
wenzelm@18
|
53 |
|
wenzelm@237
|
54 |
(* empty, extend, merge grams *)
|
wenzelm@18
|
55 |
|
wenzelm@237
|
56 |
val empty_gram = mk_gram [];
|
wenzelm@18
|
57 |
|
wenzelm@237
|
58 |
fun extend_gram (gram1 as Gram (prods1, _)) _ xprods2 =
|
wenzelm@18
|
59 |
let
|
wenzelm@237
|
60 |
fun symb_of (Delim s) = Some (Terminal (Token s))
|
wenzelm@237
|
61 |
| symb_of (Argument (s, p)) =
|
wenzelm@18
|
62 |
(case predef_term s of
|
wenzelm@237
|
63 |
None => Some (Nonterminal (s, p))
|
wenzelm@237
|
64 |
| Some tk => Some (Terminal tk))
|
wenzelm@18
|
65 |
| symb_of _ = None;
|
wenzelm@18
|
66 |
|
wenzelm@237
|
67 |
fun prod_of (XProd (lhs, xsymbs, const, pri)) =
|
wenzelm@237
|
68 |
(lhs, (mapfilter symb_of xsymbs, const, pri));
|
wenzelm@18
|
69 |
|
wenzelm@237
|
70 |
val prods2 = distinct (map prod_of xprods2);
|
wenzelm@18
|
71 |
in
|
wenzelm@237
|
72 |
if prods2 subset prods1 then gram1
|
wenzelm@237
|
73 |
else mk_gram (extend_list prods1 prods2)
|
wenzelm@18
|
74 |
end;
|
wenzelm@18
|
75 |
|
wenzelm@237
|
76 |
fun merge_grams (gram1 as Gram (prods1, _)) (gram2 as Gram (prods2, _)) =
|
wenzelm@237
|
77 |
if prods2 subset prods1 then gram1
|
wenzelm@237
|
78 |
else if prods1 subset prods2 then gram2
|
wenzelm@237
|
79 |
else mk_gram (merge_lists prods1 prods2);
|
wenzelm@18
|
80 |
|
wenzelm@18
|
81 |
|
wenzelm@237
|
82 |
(* pretty_gram *)
|
wenzelm@18
|
83 |
|
wenzelm@237
|
84 |
fun pretty_gram (Gram (prods, _)) =
|
wenzelm@237
|
85 |
let
|
wenzelm@237
|
86 |
fun pretty_name name = [Pretty.str (name ^ " =")];
|
wenzelm@18
|
87 |
|
wenzelm@237
|
88 |
fun pretty_symb (Terminal (Token s)) = Pretty.str (quote s)
|
wenzelm@237
|
89 |
| pretty_symb (Terminal tok) = Pretty.str (str_of_token tok)
|
wenzelm@237
|
90 |
| pretty_symb (Nonterminal (s, p)) =
|
wenzelm@237
|
91 |
Pretty.str (s ^ "[" ^ string_of_int p ^ "]");
|
wenzelm@18
|
92 |
|
wenzelm@237
|
93 |
fun pretty_const "" = []
|
wenzelm@237
|
94 |
| pretty_const c = [Pretty.str ("=> " ^ quote c)];
|
wenzelm@237
|
95 |
|
wenzelm@237
|
96 |
fun pretty_pri p = [Pretty.str ("(" ^ string_of_int p ^ ")")];
|
wenzelm@237
|
97 |
|
wenzelm@237
|
98 |
fun pretty_prod (name, (symbs, const, pri)) =
|
wenzelm@237
|
99 |
Pretty.block (Pretty.breaks (pretty_name name @
|
wenzelm@237
|
100 |
map pretty_symb symbs @ pretty_const const @ pretty_pri pri));
|
wenzelm@18
|
101 |
in
|
wenzelm@237
|
102 |
map pretty_prod prods
|
wenzelm@18
|
103 |
end;
|
wenzelm@18
|
104 |
|
wenzelm@18
|
105 |
|
wenzelm@18
|
106 |
|
wenzelm@18
|
107 |
(** parse **)
|
wenzelm@18
|
108 |
|
wenzelm@237
|
109 |
datatype parsetree =
|
wenzelm@237
|
110 |
Node of string * parsetree list |
|
wenzelm@237
|
111 |
Tip of token;
|
wenzelm@237
|
112 |
|
wenzelm@18
|
113 |
type state =
|
wenzelm@18
|
114 |
string * int
|
wenzelm@18
|
115 |
* parsetree list (*before point*)
|
wenzelm@18
|
116 |
* symb list (*after point*)
|
wenzelm@18
|
117 |
* string * int;
|
wenzelm@18
|
118 |
|
wenzelm@18
|
119 |
type earleystate = state list Array.array;
|
wenzelm@18
|
120 |
|
wenzelm@18
|
121 |
|
wenzelm@18
|
122 |
|
wenzelm@237
|
123 |
fun get_prods tab lhs pred =
|
wenzelm@237
|
124 |
filter pred (Symtab.lookup_multi (tab, lhs));
|
wenzelm@237
|
125 |
|
wenzelm@18
|
126 |
fun getProductions tab A i =
|
wenzelm@18
|
127 |
get_prods tab A (fn (_, _, j) => j >= i orelse j = ~1);
|
wenzelm@18
|
128 |
|
wenzelm@18
|
129 |
fun getProductions' tab A i k =
|
wenzelm@18
|
130 |
get_prods tab A (fn (_, _, j) => (j >= i andalso j < k) orelse j = ~1);
|
wenzelm@18
|
131 |
|
wenzelm@18
|
132 |
|
wenzelm@18
|
133 |
|
wenzelm@237
|
134 |
fun mkState i jj A ([Nonterminal (B, ~1)], id, ~1) =
|
wenzelm@237
|
135 |
(A, max_pri, [], [Nonterminal (B, jj)], id, i)
|
wenzelm@237
|
136 |
| mkState i jj A (ss, id, j) = (A, j, [], ss, id, i);
|
wenzelm@18
|
137 |
|
wenzelm@18
|
138 |
|
wenzelm@18
|
139 |
|
wenzelm@237
|
140 |
fun conc (t, k:int) [] = (None, [(t, k)])
|
wenzelm@237
|
141 |
| conc (t, k) ((t', k') :: ts) =
|
wenzelm@237
|
142 |
if t = t' then
|
wenzelm@237
|
143 |
(Some k', if k' >= k then (t', k') :: ts else (t, k) :: ts)
|
wenzelm@237
|
144 |
else
|
wenzelm@237
|
145 |
let val (n, ts') = conc (t, k) ts
|
wenzelm@237
|
146 |
in (n, (t', k') :: ts') end;
|
wenzelm@18
|
147 |
|
wenzelm@18
|
148 |
|
wenzelm@237
|
149 |
fun update_tree ((B, (i, ts)) :: used) (A, t) =
|
wenzelm@237
|
150 |
if A = B then
|
wenzelm@237
|
151 |
let val (n, ts') = conc t ts
|
wenzelm@237
|
152 |
in ((A, (i, ts')) :: used, n) end
|
wenzelm@237
|
153 |
else
|
wenzelm@237
|
154 |
let val (used', n) = update_tree used (A, t)
|
wenzelm@237
|
155 |
in ((B, (i, ts)) :: used', n) end;
|
wenzelm@18
|
156 |
|
wenzelm@18
|
157 |
|
wenzelm@237
|
158 |
fun update_index ((B, (i, ts)) :: used) (A, j) =
|
wenzelm@237
|
159 |
if A = B then (A, (j, ts)) :: used
|
wenzelm@237
|
160 |
else (B, (i, ts)) :: (update_index used (A, j));
|
wenzelm@18
|
161 |
|
wenzelm@18
|
162 |
|
wenzelm@18
|
163 |
|
wenzelm@18
|
164 |
fun getS A i Si =
|
wenzelm@237
|
165 |
filter
|
wenzelm@237
|
166 |
(fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= i
|
wenzelm@237
|
167 |
| _ => false) Si;
|
wenzelm@18
|
168 |
|
wenzelm@18
|
169 |
fun getS' A k n Si =
|
wenzelm@237
|
170 |
filter
|
wenzelm@237
|
171 |
(fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= k andalso j > n
|
wenzelm@237
|
172 |
| _ => false) Si;
|
wenzelm@18
|
173 |
|
wenzelm@18
|
174 |
fun getStates Estate i ii A k =
|
wenzelm@237
|
175 |
filter
|
wenzelm@237
|
176 |
(fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= k
|
wenzelm@237
|
177 |
| _ => false)
|
wenzelm@237
|
178 |
(Array.sub (Estate, ii));
|
wenzelm@18
|
179 |
|
wenzelm@18
|
180 |
|
wenzelm@18
|
181 |
(* MMW *)(* FIXME *)
|
wenzelm@237
|
182 |
fun movedot_term (A, j, ts, Terminal a :: sa, id, i) c =
|
wenzelm@237
|
183 |
if valued_token c then
|
wenzelm@237
|
184 |
(A, j, (ts @ [Tip c]), sa, id, i)
|
wenzelm@237
|
185 |
else (A, j, ts, sa, id, i);
|
wenzelm@18
|
186 |
|
wenzelm@237
|
187 |
fun movedot_nonterm ts (A, j, tss, Nonterminal (B, k) :: sa, id, i) =
|
wenzelm@237
|
188 |
(A, j, tss @ ts, sa, id, i);
|
wenzelm@18
|
189 |
|
wenzelm@237
|
190 |
fun movedot_lambda _ [] = []
|
wenzelm@237
|
191 |
| movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ((t, ki) :: ts) =
|
wenzelm@237
|
192 |
if k <= ki then
|
wenzelm@237
|
193 |
(B, j, tss @ t, sa, id, i) ::
|
wenzelm@237
|
194 |
movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ts
|
wenzelm@237
|
195 |
else movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ts;
|
wenzelm@18
|
196 |
|
wenzelm@18
|
197 |
|
wenzelm@18
|
198 |
|
wenzelm@18
|
199 |
fun PROCESSS Estate grammar i c states =
|
wenzelm@18
|
200 |
let
|
wenzelm@237
|
201 |
fun processS used [] (Si, Sii) = (Si, Sii)
|
wenzelm@237
|
202 |
| processS used (S :: States) (Si, Sii) =
|
wenzelm@237
|
203 |
(case S of
|
wenzelm@237
|
204 |
(_, _, _, Nonterminal (A, j) :: _, _, _) =>
|
wenzelm@18
|
205 |
let
|
wenzelm@18
|
206 |
val (used_neu, States_neu) =
|
wenzelm@237
|
207 |
(case assoc (used, A) of
|
wenzelm@237
|
208 |
Some (k, l) => (*A gehoert zu used*)
|
wenzelm@237
|
209 |
if k <= j (*Prioritaet wurde behandelt*)
|
wenzelm@237
|
210 |
then (used, movedot_lambda S l)
|
wenzelm@237
|
211 |
else (*Prioritaet wurde noch nicht behandelt*)
|
wenzelm@237
|
212 |
let
|
wenzelm@237
|
213 |
val L = getProductions' grammar A j k;
|
wenzelm@237
|
214 |
val States' = map (mkState i j A) L;
|
wenzelm@237
|
215 |
in
|
wenzelm@237
|
216 |
(update_index used (A, j), States' @ movedot_lambda S l)
|
wenzelm@237
|
217 |
end
|
wenzelm@18
|
218 |
|
wenzelm@237
|
219 |
| None => (*A gehoert nicht zu used*)
|
wenzelm@237
|
220 |
let
|
wenzelm@237
|
221 |
val L = getProductions grammar A j;
|
wenzelm@237
|
222 |
val States' = map (mkState i j A) L;
|
wenzelm@237
|
223 |
in
|
wenzelm@237
|
224 |
((A, (j, [])) :: used, States')
|
wenzelm@237
|
225 |
end)
|
wenzelm@237
|
226 |
in
|
wenzelm@237
|
227 |
processS used_neu (States @ States_neu) (S :: Si, Sii)
|
wenzelm@237
|
228 |
end
|
wenzelm@237
|
229 |
| (_, _, _, Terminal a :: _, _, _) =>
|
wenzelm@237
|
230 |
processS used States
|
wenzelm@237
|
231 |
(S :: Si,
|
wenzelm@237
|
232 |
if matching_tokens (a, c) then movedot_term S c :: Sii else Sii)
|
wenzelm@237
|
233 |
(* MMW *)(* FIXME *)
|
wenzelm@18
|
234 |
|
wenzelm@237
|
235 |
| (A, k, ts, [], id, j) =>
|
wenzelm@237
|
236 |
let
|
wenzelm@237
|
237 |
val tt = if id = "" then ts else [Node (id, ts)]
|
wenzelm@237
|
238 |
in
|
wenzelm@237
|
239 |
if j = i then
|
wenzelm@237
|
240 |
let
|
wenzelm@237
|
241 |
val (used', O) = update_tree used (A, (tt, k));
|
wenzelm@237
|
242 |
in
|
wenzelm@237
|
243 |
(case O of
|
wenzelm@237
|
244 |
None =>
|
wenzelm@237
|
245 |
let
|
wenzelm@237
|
246 |
val Slist = getS A k Si;
|
wenzelm@237
|
247 |
val States' = map (movedot_nonterm tt) Slist;
|
wenzelm@237
|
248 |
in
|
wenzelm@237
|
249 |
processS used' (States @ States') (S :: Si, Sii)
|
wenzelm@237
|
250 |
end
|
wenzelm@237
|
251 |
| Some n =>
|
wenzelm@237
|
252 |
if n >= k then
|
wenzelm@237
|
253 |
processS used' States (S :: Si, Sii)
|
wenzelm@237
|
254 |
else
|
wenzelm@237
|
255 |
let
|
wenzelm@237
|
256 |
val Slist = getS' A k n Si;
|
wenzelm@237
|
257 |
val States' = map (movedot_nonterm tt) Slist;
|
wenzelm@18
|
258 |
in
|
wenzelm@237
|
259 |
processS used' (States @ States') (S :: Si, Sii)
|
wenzelm@237
|
260 |
end)
|
wenzelm@237
|
261 |
end
|
wenzelm@237
|
262 |
else
|
wenzelm@237
|
263 |
processS used
|
wenzelm@237
|
264 |
(States @ map (movedot_nonterm tt) (getStates Estate i j A k))
|
wenzelm@237
|
265 |
(S :: Si, Sii)
|
wenzelm@237
|
266 |
end)
|
wenzelm@18
|
267 |
in
|
wenzelm@237
|
268 |
processS [] states ([], [])
|
wenzelm@18
|
269 |
end;
|
wenzelm@18
|
270 |
|
wenzelm@18
|
271 |
|
wenzelm@18
|
272 |
|
wenzelm@18
|
273 |
fun syntax_error toks =
|
wenzelm@18
|
274 |
error ("Syntax error at: " ^ space_implode " " (map str_of_token toks));
|
wenzelm@18
|
275 |
|
wenzelm@237
|
276 |
fun produce grammar stateset i indata =
|
wenzelm@237
|
277 |
(case Array.sub (stateset, i) of
|
wenzelm@237
|
278 |
[] => syntax_error indata (* MMW *)(* FIXME *)
|
wenzelm@237
|
279 |
| s =>
|
wenzelm@237
|
280 |
(case indata of
|
wenzelm@237
|
281 |
[] => Array.sub (stateset, i)
|
wenzelm@237
|
282 |
| c :: cs =>
|
wenzelm@237
|
283 |
let
|
wenzelm@237
|
284 |
val (si, sii) = PROCESSS stateset grammar i c s;
|
wenzelm@237
|
285 |
in
|
wenzelm@237
|
286 |
Array.update (stateset, i, si);
|
wenzelm@237
|
287 |
Array.update (stateset, i + 1, sii);
|
wenzelm@237
|
288 |
produce grammar stateset (i + 1) cs
|
wenzelm@237
|
289 |
end));
|
wenzelm@18
|
290 |
|
wenzelm@18
|
291 |
|
wenzelm@237
|
292 |
(*(* FIXME new *)
|
wenzelm@237
|
293 |
val get_trees = mapfilter (fn (_, _, [pt], _, _, _) => Some pt | _ => None);
|
wenzelm@237
|
294 |
*)
|
wenzelm@237
|
295 |
|
wenzelm@18
|
296 |
fun get_trees [] = []
|
wenzelm@237
|
297 |
| get_trees ((_, _, pt_lst, _, _, _) :: stateset) =
|
wenzelm@237
|
298 |
let val l = get_trees stateset
|
wenzelm@237
|
299 |
in case pt_lst of
|
wenzelm@237
|
300 |
[ptree] => ptree :: l
|
wenzelm@237
|
301 |
| _ => l
|
wenzelm@237
|
302 |
end;
|
wenzelm@18
|
303 |
|
wenzelm@18
|
304 |
fun earley grammar startsymbol indata =
|
wenzelm@237
|
305 |
let
|
wenzelm@237
|
306 |
val S0 = [("S'", 0, [], [Nonterminal (startsymbol, 0), Terminal EndToken], "", 0)];
|
wenzelm@237
|
307 |
val s = length indata + 1; (* MMW *)(* FIXME was .. + 2 *)
|
wenzelm@237
|
308 |
val Estate = Array.array (s, []);
|
wenzelm@237
|
309 |
in
|
wenzelm@237
|
310 |
Array.update (Estate, 0, S0);
|
wenzelm@237
|
311 |
let
|
wenzelm@237
|
312 |
val l = produce grammar Estate 0 indata; (* MMW *)(* FIXME was .. @ [EndToken] *)
|
wenzelm@237
|
313 |
val p_trees = get_trees l;
|
wenzelm@237
|
314 |
in p_trees end
|
wenzelm@237
|
315 |
end;
|
wenzelm@18
|
316 |
|
wenzelm@18
|
317 |
|
wenzelm@18
|
318 |
(* FIXME demo *)
|
wenzelm@237
|
319 |
fun parse (Gram (_, prod_tab)) start toks =
|
wenzelm@237
|
320 |
(case earley prod_tab start toks of
|
wenzelm@237
|
321 |
[] => sys_error "parse: no parse trees"
|
wenzelm@237
|
322 |
| pt :: _ => pt);
|
wenzelm@18
|
323 |
|
wenzelm@18
|
324 |
|
wenzelm@18
|
325 |
end;
|
wenzelm@18
|
326 |
|