wenzelm@26528
|
1 |
(* Title: Pure/General/yxml.ML
|
wenzelm@26528
|
2 |
ID: $Id$
|
wenzelm@26528
|
3 |
Author: Makarius
|
wenzelm@26528
|
4 |
|
wenzelm@26540
|
5 |
Efficient text representation of XML trees using extra characters X
|
wenzelm@26540
|
6 |
and Y -- no escaping, may nest marked text verbatim.
|
wenzelm@26528
|
7 |
|
wenzelm@26540
|
8 |
Markup <elem att="val" ...>...body...</elem> is encoded as:
|
wenzelm@26540
|
9 |
|
wenzelm@26540
|
10 |
X Y name Y att=val ... X
|
wenzelm@26528
|
11 |
...
|
wenzelm@26528
|
12 |
body
|
wenzelm@26528
|
13 |
...
|
wenzelm@26540
|
14 |
X Y X
|
wenzelm@26528
|
15 |
*)
|
wenzelm@26528
|
16 |
|
wenzelm@26528
|
17 |
signature YXML =
|
wenzelm@26528
|
18 |
sig
|
wenzelm@26528
|
19 |
val detect: string -> bool
|
wenzelm@26540
|
20 |
val output_markup: Markup.T -> string * string
|
wenzelm@26528
|
21 |
val element: string -> XML.attributes -> string list -> string
|
wenzelm@26528
|
22 |
val string_of: XML.tree -> string
|
wenzelm@26528
|
23 |
val parse_body: string -> XML.tree list
|
wenzelm@26528
|
24 |
val parse: string -> XML.tree
|
wenzelm@26528
|
25 |
end;
|
wenzelm@26528
|
26 |
|
wenzelm@26528
|
27 |
structure YXML: YXML =
|
wenzelm@26528
|
28 |
struct
|
wenzelm@26528
|
29 |
|
wenzelm@26540
|
30 |
(** string representation **)
|
wenzelm@26528
|
31 |
|
wenzelm@26547
|
32 |
(* markers *)
|
wenzelm@26547
|
33 |
|
wenzelm@26540
|
34 |
val X = Symbol.ENQ;
|
wenzelm@26540
|
35 |
val Y = Symbol.ACK;
|
wenzelm@26540
|
36 |
val XY = X ^ Y;
|
wenzelm@26540
|
37 |
val XYX = XY ^ X;
|
wenzelm@26528
|
38 |
|
wenzelm@26540
|
39 |
val detect = String.isPrefix XY;
|
wenzelm@26528
|
40 |
|
wenzelm@26528
|
41 |
|
wenzelm@26547
|
42 |
(* output *)
|
wenzelm@26540
|
43 |
|
wenzelm@27884
|
44 |
fun output_markup (markup as (name, atts)) =
|
wenzelm@27884
|
45 |
if Markup.is_none markup then ("", "")
|
wenzelm@27884
|
46 |
else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX);
|
wenzelm@26540
|
47 |
|
wenzelm@26528
|
48 |
fun element name atts body =
|
wenzelm@26540
|
49 |
let val (pre, post) = output_markup (name, atts)
|
wenzelm@26540
|
50 |
in pre ^ implode body ^ post end;
|
wenzelm@26528
|
51 |
|
wenzelm@26528
|
52 |
fun string_of t =
|
wenzelm@26528
|
53 |
let
|
wenzelm@26528
|
54 |
fun attrib (a, x) =
|
wenzelm@26540
|
55 |
Buffer.add Y #>
|
wenzelm@26528
|
56 |
Buffer.add a #> Buffer.add "=" #> Buffer.add x;
|
wenzelm@26528
|
57 |
fun tree (XML.Elem (name, atts, ts)) =
|
wenzelm@26540
|
58 |
Buffer.add XY #> Buffer.add name #> fold attrib atts #> Buffer.add X #>
|
wenzelm@26528
|
59 |
fold tree ts #>
|
wenzelm@26540
|
60 |
Buffer.add XYX
|
wenzelm@26528
|
61 |
| tree (XML.Text s) = Buffer.add s
|
wenzelm@26528
|
62 |
| tree (XML.Output s) = Buffer.add s;
|
wenzelm@26528
|
63 |
in Buffer.empty |> tree t |> Buffer.content end;
|
wenzelm@26528
|
64 |
|
wenzelm@26528
|
65 |
|
wenzelm@26540
|
66 |
|
wenzelm@26540
|
67 |
(** efficient YXML parsing **)
|
wenzelm@26528
|
68 |
|
wenzelm@26528
|
69 |
local
|
wenzelm@26528
|
70 |
|
wenzelm@26540
|
71 |
(* splitting *)
|
wenzelm@26540
|
72 |
|
wenzelm@26540
|
73 |
fun is_char s c = ord s = Char.ord c;
|
wenzelm@26540
|
74 |
|
wenzelm@26540
|
75 |
val split_string =
|
wenzelm@26540
|
76 |
Substring.full #>
|
wenzelm@26540
|
77 |
Substring.tokens (is_char X) #>
|
wenzelm@26540
|
78 |
map (Substring.fields (is_char Y) #> map Substring.string);
|
wenzelm@26540
|
79 |
|
wenzelm@26540
|
80 |
|
wenzelm@26540
|
81 |
(* structural errors *)
|
wenzelm@26528
|
82 |
|
wenzelm@26528
|
83 |
fun err msg = raise Fail ("Malformed YXML encoding: " ^ msg);
|
wenzelm@26528
|
84 |
fun err_attribute () = err "bad attribute";
|
wenzelm@26528
|
85 |
fun err_element () = err "bad element";
|
wenzelm@26528
|
86 |
fun err_unbalanced "" = err "unbalanced element"
|
wenzelm@26528
|
87 |
| err_unbalanced name = err ("unbalanced element " ^ quote name);
|
wenzelm@26528
|
88 |
|
wenzelm@26528
|
89 |
|
wenzelm@26528
|
90 |
(* stack operations *)
|
wenzelm@26528
|
91 |
|
wenzelm@26528
|
92 |
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending;
|
wenzelm@26528
|
93 |
|
wenzelm@26528
|
94 |
fun push "" _ _ = err_element ()
|
wenzelm@26528
|
95 |
| push name atts pending = ((name, atts), []) :: pending;
|
wenzelm@26528
|
96 |
|
wenzelm@26528
|
97 |
fun pop ((("", _), _) :: _) = err_unbalanced ""
|
wenzelm@26528
|
98 |
| pop (((name, atts), body) :: pending) = add (XML.Elem (name, atts, rev body)) pending;
|
wenzelm@26528
|
99 |
|
wenzelm@26528
|
100 |
|
wenzelm@26540
|
101 |
(* parsing *)
|
wenzelm@26528
|
102 |
|
wenzelm@26528
|
103 |
fun parse_attrib s =
|
wenzelm@28023
|
104 |
(case find_substring "=" s of
|
wenzelm@28023
|
105 |
NONE => err_attribute ()
|
wenzelm@28023
|
106 |
| SOME 0 => err_attribute ()
|
wenzelm@28023
|
107 |
| SOME i => (String.substring (s, 0, i), String.extract (s, i + 1, NONE)));
|
wenzelm@26528
|
108 |
|
wenzelm@26540
|
109 |
fun parse_chunk ["", ""] = pop
|
wenzelm@26540
|
110 |
| parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
|
wenzelm@26540
|
111 |
| parse_chunk txts = fold (add o XML.Text) txts;
|
wenzelm@26528
|
112 |
|
wenzelm@26528
|
113 |
in
|
wenzelm@26528
|
114 |
|
wenzelm@26528
|
115 |
fun parse_body source =
|
wenzelm@26540
|
116 |
(case fold parse_chunk (split_string source) [(("", []), [])] of
|
wenzelm@26528
|
117 |
[(("", _), result)] => rev result
|
wenzelm@26528
|
118 |
| ((name, _), _) :: _ => err_unbalanced name);
|
wenzelm@26528
|
119 |
|
wenzelm@26528
|
120 |
fun parse source =
|
wenzelm@26528
|
121 |
(case parse_body source of
|
wenzelm@27798
|
122 |
[result] => result
|
wenzelm@27798
|
123 |
| [] => XML.Text ""
|
wenzelm@27798
|
124 |
| _ => err "multiple results");
|
wenzelm@26528
|
125 |
|
wenzelm@26528
|
126 |
end;
|
wenzelm@26528
|
127 |
|
wenzelm@26528
|
128 |
end;
|
wenzelm@26528
|
129 |
|