haftmann@24584
|
1 |
(*  Title:      HOL/Import/xml.ML
    ID:         $Id$
    Author:     David Aspinall, Stefan Berghofer and Markus Wenzel

Basic support for XML.
*)
|
obua@19064
|
7 |
|
obua@19064
|
8 |
signature XML =
sig
  (* string-level output: document header, escaped text, CDATA sections
     and complete elements built from already rendered children *)
  val header: string
  val text: string -> string
  val text_charref: string -> string
  val cdata: string -> string
  val element: string -> (string * string) list -> string list -> string

  (* explicit trees: an element has a name, attribute pairs and children *)
  datatype tree =
      Elem of string * (string * string) list * tree list
    | Text of string

  (* conversion between trees and XML source text *)
  val string_of_tree: tree -> string
  val tree_of_string: string -> tree

  (* conversion between trees and the length-prefixed external format *)
  val encoded_string_of_tree : tree -> string
  val tree_of_encoded_string : string -> tree
end;
|
obua@19064
|
26 |
|
obua@19089
|
27 |
structure XML :> XML =
|
obua@19064
|
28 |
struct
|
obua@19064
|
29 |
|
obua@19093
|
30 |
(*structure Seq = VectorScannerSeq
|
obua@19093
|
31 |
structure Scan = Scanner (structure Seq = Seq)*)
|
obua@19093
|
32 |
structure Seq = StringScannerSeq
|
obua@19093
|
33 |
structure Scan = StringScanner
|
obua@19093
|
34 |
|
obua@19095
|
35 |
|
obua@19064
|
36 |
open Scan
|
obua@19064
|
37 |
|
obua@19064
|
38 |
(** string based representation (small scale) **)
|
obua@19064
|
39 |
|
obua@19064
|
40 |
(* XML declaration line (version 1.0), terminated by a newline. *)
val header = "<?xml version=\"1.0\"?>\n";
|
obua@19064
|
41 |
|
obua@19064
|
42 |
(* text and character data *)
|
obua@19064
|
43 |
|
obua@19064
|
44 |
(* Translate one of the five predefined XML entity references into the
   character it denotes.  The argument is the complete reference as assembled
   by scan_special, i.e. "&" followed by the entity name and ";".  Any other
   string is returned unchanged. *)
fun decode "&lt;" = "<"
  | decode "&gt;" = ">"
  | decode "&amp;" = "&"
  | decode "&apos;" = "'"
  | decode "&quot;" = "\""
  | decode c = c;
|
obua@19064
|
50 |
|
obua@19064
|
51 |
(* Escape a single character for use in XML text or attribute values: the
   five markup-significant characters are replaced by their predefined entity
   references, every other string is returned unchanged.  This is the inverse
   of decode on those five characters. *)
fun encode "<" = "&lt;"
  | encode ">" = "&gt;"
  | encode "&" = "&amp;"
  | encode "'" = "&apos;"
  | encode "\"" = "&quot;"
  | encode c = c;
|
obua@19064
|
57 |
|
obua@19064
|
58 |
(* Render a character as a decimal numeric character reference, i.e. "&#"
   followed by the decimal value of ord c and ";". *)
fun encode_charref c = String.concat ["&#", Int.toString (ord c), ";"]
|
obua@19064
|
59 |
|
obua@19064
|
60 |
(* Escape a string by applying encode to each of its characters. *)
fun text str = Library.translate_string encode str

(* Escape a string by turning each of its characters into a numeric
   character reference via encode_charref. *)
fun text_charref str = translate_string encode_charref str;

(* Wrap a string in a CDATA section; the closing marker is followed by a
   newline.  The content itself is not inspected or escaped. *)
fun cdata str = enclose "<![CDATA[" "]]>\n" str
|
obua@19064
|
65 |
|
obua@19064
|
66 |
(* elements *)
|
obua@19064
|
67 |
|
obua@19064
|
68 |
(* Render one attribute as: name = "escaped value" (with spaces around the
   equals sign, value escaped by text). *)
fun attribute (a, x) = String.concat [a, " = \"", text x, "\""];
|
obua@19064
|
69 |
|
obua@19064
|
70 |
(* Render an element from its name, attribute pairs and already rendered
   children.  Without children the self-closing form is produced; otherwise
   the children are concatenated between an opening and a closing tag. *)
fun element name atts cs =
  let
    val tag = space_implode " " (name :: map attribute atts)
  in
    case cs of
      [] => enclose "<" "/>" tag
    | _ => enclose "<" ">" tag ^ implode cs ^ enclose "</" ">" name
  end;
|
obua@19064
|
75 |
|
obua@19064
|
76 |
(** explicit XML trees **)
|
obua@19064
|
77 |
|
obua@19064
|
78 |
(* XML tree: Elem carries the element name, its attribute (name, value)
   pairs and its children; Text is a leaf holding character data. *)
datatype tree =
    Elem of string * (string * string) list * tree list
  | Text of string;
|
obua@19064
|
81 |
|
obua@19064
|
82 |
(* Serialize a tree to XML text, accumulating the output in a Buffer.
   Text nodes are escaped with text; elements without children use the
   self-closing form. *)
fun string_of_tree tree =
  let
    fun render (Text s) buf = Buffer.add (text s) buf
      | render (Elem (name, atts, ts)) buf =
          let
            (* "<name att1 att2 ..." without the closing bracket *)
            val opened =
              buf
              |> Buffer.add "<"
              |> fold Buffer.add (separate " " (name :: map attribute atts))
          in
            case ts of
              [] => Buffer.add "/>" opened
            | _ =>
                opened
                |> Buffer.add ">"
                |> fold render ts
                |> Buffer.add "</"
                |> Buffer.add name
                |> Buffer.add ">"
          end;
  in Buffer.content (render tree Buffer.empty) end;
|
obua@19064
|
98 |
|
obua@19064
|
99 |
(** XML parsing **)
|
obua@19064
|
100 |
|
obua@19089
|
101 |
(* Take at most n leading items of the input sequence and hand them to
   Symbol.beginning n; used for error messages. *)
fun beginning n xs =
  let val prefix = Seq.take_at_most xs n
  in Symbol.beginning n prefix end
|
obua@19064
|
102 |
|
obua@19064
|
103 |
(* Build a parse error message from a description s and the remaining
   input xs, quoting up to 100 items of what was found. *)
fun err s xs =
  String.concat ["XML parsing error: ", s, "\nfound: ", quote (beginning 100 xs)];
|
obua@19064
|
105 |
|
obua@19064
|
106 |
(* Scanner built from Scan.any with the Symbol.is_blank predicate; it is used
   wherever optional whitespace may appear. *)
val scan_whspc = Scan.any Symbol.is_blank;
|
obua@19064
|
107 |
|
obua@19064
|
108 |
(* Scanner for an entity reference: "&", then scan_id, then ";", combined
   with ^^ and passed through decode.
   NOTE(review): presumably ^^ concatenates the scanned strings, so decode
   receives the full reference text -- confirm against the Scan structure. *)
val scan_special = $$ "&" ^^ scan_id ^^ $$ ";" >> decode;
|
obua@19064
|
109 |
|
obua@19064
|
110 |
(* Non-empty run of character data up to (not including) the next "<".
   Entity references are decoded via scan_special; everything else must
   satisfy Symbol.is_regular.  Whitespace is not skipped here (the original
   carried a commented-out scan_whspc prefix). *)
val parse_chars =
  let
    val plain_char =
      Scan.unless ($$ "<") (scan_special || Scan.one Symbol.is_regular)
  in Scan.repeat1 plain_char >> implode end;
|
obua@19064
|
112 |
|
obua@19064
|
113 |
(* CDATA section: returns the raw content between the opening marker and
   the closing "]]>" as one string. *)
val parse_cdata =
  let
    val terminator = Scan.this_string "]]>"
    val payload =
      Scan.repeat (Scan.unless terminator (Scan.one Symbol.is_regular)) >> implode
  in Scan.this_string "<![CDATA[" |-- payload --| terminator end;
|
obua@19064
|
116 |
|
obua@19064
|
117 |
(* Attribute: identifier, "=", and a value delimited by matching double or
   single quotes, with optional whitespace around "=".  Entity references
   inside the value are decoded.  Yields the (name, value) pair. *)
val parse_att =
  let
    val open_quote = $$ "\"" || $$ "'"
    (* value characters up to the same quote that opened the value *)
    fun quoted_value q =
      (Scan.repeat (Scan.unless ($$ q) (scan_special || Scan.one Symbol.is_regular)) >> implode)
        --| $$ q
    val value = open_quote :-- quoted_value >> snd
  in scan_id --| scan_whspc --| $$ "=" --| scan_whspc -- value end;
|
obua@19064
|
121 |
|
obua@19064
|
122 |
(* XML comment: opening marker, any regular symbols up to the closing
   marker, then the closing marker itself.  All three parts are returned. *)
val parse_comment =
  let
    val closer = Scan.this_string "-->"
    val body = Scan.repeat (Scan.unless closer (Scan.one Symbol.is_regular))
  in Scan.this_string "<!--" -- body -- closer end;
|
obua@19064
|
125 |
|
obua@19064
|
126 |
(* Skip whitespace followed by any number of comments, each of which may be
   followed by further whitespace; the result is unit. *)
val scan_comment_whspc =
  let
    val skip_blanks = scan_whspc >> K ()
    val comments = Scan.repeat (parse_comment |-- skip_blanks)
  in skip_blanks --| comments end;
|
obua@19064
|
128 |
|
obua@19064
|
129 |
(* Processing instruction: returns the symbols between "<?" and "?>". *)
val parse_pi =
  let
    val closer = Scan.this_string "?>"
    val body = Scan.repeat (Scan.unless closer (Scan.one Symbol.is_regular))
  in Scan.this_string "<?" |-- body --| closer end;
|
obua@19064
|
132 |
|
obua@19064
|
133 |
(* Content of an element: optional leading character data, then any number
   of markup items (child element, CDATA section, processing instruction or
   comment), each optionally followed by more character data.  Processing
   instructions and comments contribute no nodes.  Whitespace is not skipped
   (the original carried commented-out scan_whspc calls at these points). *)
fun parse_content xs =
  let
    val text_node = Scan.optional (parse_chars >> (single o Text)) []
    val markup =
      (   parse_elem >> single
       || parse_cdata >> (single o Text)
       || parse_pi >> K []
       || parse_comment >> K [])
    val rest = Scan.repeat (markup -- text_node >> op @) >> List.concat
  in
    (text_node -- rest >> op @) xs
  end

(* Element: "<", name, attributes, then either "/>" (no children) or ">",
   the content, and a closing tag that must repeat the element name.
   Both the tag terminator and the closing tag are wrapped in !! with a
   message built by err. *)
and parse_elem xs =
  let
    fun closing_of tag = Scan.this_string ("</" ^ tag) --| scan_whspc --| $$ ">"
    fun body (tag, _) =
      !! (err "Expected > or />")
        (Scan.this_string "/>" >> K []
         || $$ ">" |-- parse_content --|
            !! (err ("Expected </" ^ tag ^ ">")) (closing_of tag))
  in
    ($$ "<" |-- scan_id --
      Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- body >>
      (fn ((tag, atts), ts) => Elem (tag, atts, ts))) xs
  end;
|
obua@19064
|
152 |
|
obua@19064
|
153 |
(* Document: an optional DOCTYPE declaration (its text up to ">" is
   returned as a string option) followed by a single root element. *)
val parse_document =
  let
    val doctype_text =
      Scan.repeat (Scan.unless ($$ ">") (Scan.one Symbol.is_regular)) >> implode
    val doctype =
      Scan.this_string "<!DOCTYPE" -- scan_whspc |-- doctype_text --| $$ ">" --| scan_whspc
  in Scan.option doctype -- parse_elem end;
|
obua@19064
|
158 |
|
obua@19064
|
159 |
(* Parse XML source text consisting of one element, optionally surrounded
   by whitespace.  Calls error if any input is left over after the element. *)
fun tree_of_string s =
  let
    val input = Seq.fromString s
    val parse_whole =
      !! (err "Malformed element") (scan_whspc |-- parse_elem --| scan_whspc)
    val (result, rest) = parse_whole input
  in
    if not (Seq.null rest) then
      error ("Unprocessed input: '" ^ beginning 100 rest ^ "'")
    else result
  end
|
obua@19089
|
167 |
|
obua@19089
|
168 |
(* More efficient saving and loading of xml trees employing a proprietary external format *)
|
obua@19089
|
169 |
|
obua@19089
|
170 |
(* Append a length-prefixed string: decimal size, one space, then the
   string itself. *)
fun write_lstring s buf =
  fold Buffer.add [string_of_int (size s), " ", s] buf
|
obua@19089
|
171 |
(* Read a length-prefixed string: a natural number, one blank, then exactly
   that many arbitrary input items, joined into a string. *)
fun parse_lstring toks =
  let
    fun payload len = repeat_fixed len (one (K true))
  in
    (scan_nat --| one Symbol.is_blank :-- payload >> snd >> implode) toks
  end
|
obua@19089
|
172 |
|
obua@19089
|
173 |
(* Append a counted list: decimal length, one space, then every element
   written in order by w. *)
fun write_list w l buf =
  let
    val counted = Buffer.add " " (Buffer.add (string_of_int (length l)) buf)
  in fold w l counted end
|
obua@19089
|
174 |
(* Read a counted list: a natural number, one blank, then exactly that many
   items parsed by sc. *)
fun parse_list sc =
  let
    fun items count = repeat_fixed count sc
  in scan_nat --| one Symbol.is_blank :-- items >> snd end
|
obua@19089
|
175 |
|
obua@19089
|
176 |
(* Write a tree in the external format.  A text node is the tag "T" followed
   by its length-prefixed content; an element is the tag "E" followed by the
   length-prefixed name, the counted attribute list (each attribute as two
   length-prefixed strings) and the counted list of children. *)
fun write_tree node buf =
  case node of
    Text s => write_lstring s (Buffer.add "T" buf)
  | Elem (name, attrs, children) =>
      let
        fun write_attr (key, value) b = write_lstring value (write_lstring key b)
      in
        buf
        |> Buffer.add "E"
        |> write_lstring name
        |> write_list write_attr attrs
        |> write_list write_tree children
      end
|
obua@19089
|
182 |
|
obua@19089
|
183 |
(* Read a tree in the external format: one tag item selects the node kind
   ("T" for text, "E" for an element); any other tag raises SyntaxError. *)
fun parse_tree toks =
  let
    fun by_tag "T" = parse_lstring >> Text
      | by_tag "E" = parse_elem
      | by_tag _ = raise SyntaxError
  in (one (K true) :-- by_tag >> snd) toks end

(* Read the body of an encoded element: name, counted attribute pairs and
   counted children.  From here on this definition shadows the XML-text
   parse_elem defined earlier in the structure. *)
and parse_elem toks =
  let
    val attribute_pair = parse_lstring -- parse_lstring
    fun build ((name, atts), children) = Elem (name, atts, children)
  in
    (parse_lstring -- parse_list attribute_pair -- parse_list parse_tree >> build) toks
  end
|
obua@19089
|
185 |
|
obua@19089
|
186 |
(* Serialize a tree to the length-prefixed external format. *)
fun encoded_string_of_tree tree =
  Buffer.empty |> write_tree tree |> Buffer.content
|
obua@19089
|
187 |
|
obua@19089
|
188 |
(* Parse a tree from the length-prefixed external format.  Calls error if
   input is left over after the tree.
   NOTE(review): the writeln/timeit output (string length, sequence length,
   timings) looks like a debugging leftover; it is preserved as is. *)
fun tree_of_encoded_string s =
  let
    fun report (label, n) = writeln (label ^ " " ^ string_of_int n)
    val _ = report ("length of encoded string: ", size s)
    val _ = writeln "Seq.fromString..."
    val input = timeit (fn () => Seq.fromString s)
    val _ = report ("length of sequence", timeit (fn () => Seq.length input))
    val parse_whole = !! (err "Malformed encoded xml") parse_tree
    val (result, rest) = parse_whole input
  in
    if not (Seq.null rest) then
      error ("Unprocessed input: '" ^ beginning 100 rest ^ "'")
    else result
  end
|
obua@19089
|
200 |
|
obua@19064
|
201 |
end;
|