src/Pure/General/yxml.ML
author wenzelm
Wed, 27 Aug 2008 16:32:48 +0200
changeset 28023 92dd3ad302b7
parent 27932 7a28472be96b
child 28025 d9fcab768496
permissions -rw-r--r--
simplified parse_attrib (find_substring instead of space_explode);
wenzelm@26528
     1
(*  Title:      Pure/General/yxml.ML
wenzelm@26528
     2
    ID:         $Id$
wenzelm@26528
     3
    Author:     Makarius
wenzelm@26528
     4
wenzelm@26540
     5
Efficient text representation of XML trees using extra characters X
wenzelm@26540
     6
and Y -- no escaping, may nest marked text verbatim.
wenzelm@26528
     7
wenzelm@26540
     8
Markup <elem att="val" ...>...body...</elem> is encoded as:
wenzelm@26540
     9
wenzelm@26540
    10
  X Y name Y att=val ... X
wenzelm@26528
    11
  ...
wenzelm@26528
    12
  body
wenzelm@26528
    13
  ...
wenzelm@26540
    14
  X Y X
wenzelm@26528
    15
*)
wenzelm@26528
    16
wenzelm@26528
    17
signature YXML =
wenzelm@26528
    18
sig
wenzelm@26528
    19
  val detect: string -> bool
wenzelm@26540
    20
  val output_markup: Markup.T -> string * string
wenzelm@26528
    21
  val element: string -> XML.attributes -> string list -> string
wenzelm@26528
    22
  val string_of: XML.tree -> string
wenzelm@26528
    23
  val parse_body: string -> XML.tree list
wenzelm@26528
    24
  val parse: string -> XML.tree
wenzelm@26528
    25
end;
wenzelm@26528
    26
wenzelm@26528
    27
structure YXML: YXML =
wenzelm@26528
    28
struct
wenzelm@26528
    29
wenzelm@26540
    30
(** string representation **)
wenzelm@26528
    31
wenzelm@26547
    32
(* markers *)
wenzelm@26547
    33
wenzelm@26540
    34
val X = Symbol.ENQ;
wenzelm@26540
    35
val Y = Symbol.ACK;
wenzelm@26540
    36
val XY = X ^ Y;
wenzelm@26540
    37
val XYX = XY ^ X;
wenzelm@26528
    38
wenzelm@26540
    39
val detect = String.isPrefix XY;
wenzelm@26528
    40
wenzelm@26528
    41
wenzelm@26547
    42
(* output *)
wenzelm@26540
    43
wenzelm@27884
    44
fun output_markup (markup as (name, atts)) =
wenzelm@27884
    45
  if Markup.is_none markup then ("", "")
wenzelm@27884
    46
  else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX);
wenzelm@26540
    47
wenzelm@26528
    48
fun element name atts body =
wenzelm@26540
    49
  let val (pre, post) = output_markup (name, atts)
wenzelm@26540
    50
  in pre ^ implode body ^ post end;
wenzelm@26528
    51
wenzelm@26528
    52
fun string_of t =
wenzelm@26528
    53
  let
wenzelm@26528
    54
    fun attrib (a, x) =
wenzelm@26540
    55
      Buffer.add Y #>
wenzelm@26528
    56
      Buffer.add a #> Buffer.add "=" #> Buffer.add x;
wenzelm@26528
    57
    fun tree (XML.Elem (name, atts, ts)) =
wenzelm@26540
    58
          Buffer.add XY #> Buffer.add name #> fold attrib atts #> Buffer.add X #>
wenzelm@26528
    59
          fold tree ts #>
wenzelm@26540
    60
          Buffer.add XYX
wenzelm@26528
    61
      | tree (XML.Text s) = Buffer.add s
wenzelm@26528
    62
      | tree (XML.Output s) = Buffer.add s;
wenzelm@26528
    63
  in Buffer.empty |> tree t |> Buffer.content end;
wenzelm@26528
    64
wenzelm@26528
    65
wenzelm@26540
    66
wenzelm@26540
    67
(** efficient YXML parsing **)
wenzelm@26528
    68
wenzelm@26528
    69
local
wenzelm@26528
    70
wenzelm@26540
    71
(* splitting *)
wenzelm@26540
    72
wenzelm@26540
    73
fun is_char s c = ord s = Char.ord c;
wenzelm@26540
    74
wenzelm@26540
    75
val split_string =
wenzelm@26540
    76
  Substring.full #>
wenzelm@26540
    77
  Substring.tokens (is_char X) #>
wenzelm@26540
    78
  map (Substring.fields (is_char Y) #> map Substring.string);
wenzelm@26540
    79
wenzelm@26540
    80
wenzelm@26540
    81
(* structural errors *)
wenzelm@26528
    82
wenzelm@26528
    83
fun err msg = raise Fail ("Malformed YXML encoding: " ^ msg);
wenzelm@26528
    84
fun err_attribute () = err "bad attribute";
wenzelm@26528
    85
fun err_element () = err "bad element";
wenzelm@26528
    86
fun err_unbalanced "" = err "unbalanced element"
wenzelm@26528
    87
  | err_unbalanced name = err ("unbalanced element " ^ quote name);
wenzelm@26528
    88
wenzelm@26528
    89
wenzelm@26528
    90
(* stack operations *)
wenzelm@26528
    91
wenzelm@26528
    92
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending;
wenzelm@26528
    93
wenzelm@26528
    94
fun push "" _ _ = err_element ()
wenzelm@26528
    95
  | push name atts pending = ((name, atts), []) :: pending;
wenzelm@26528
    96
wenzelm@26528
    97
fun pop ((("", _), _) :: _) = err_unbalanced ""
wenzelm@26528
    98
  | pop (((name, atts), body) :: pending) = add (XML.Elem (name, atts, rev body)) pending;
wenzelm@26528
    99
wenzelm@26528
   100
wenzelm@26540
   101
(* parsing *)
wenzelm@26528
   102
wenzelm@26528
   103
fun parse_attrib s =
wenzelm@28023
   104
  (case find_substring "=" s of
wenzelm@28023
   105
    NONE => err_attribute ()
wenzelm@28023
   106
  | SOME 0 => err_attribute ()
wenzelm@28023
   107
  | SOME i => (String.substring (s, 0, i), String.extract (s, i + 1, NONE)));
wenzelm@26528
   108
wenzelm@26540
   109
fun parse_chunk ["", ""] = pop
wenzelm@26540
   110
  | parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
wenzelm@26540
   111
  | parse_chunk txts = fold (add o XML.Text) txts;
wenzelm@26528
   112
wenzelm@26528
   113
in
wenzelm@26528
   114
wenzelm@26528
   115
fun parse_body source =
wenzelm@26540
   116
  (case fold parse_chunk (split_string source) [(("", []), [])] of
wenzelm@26528
   117
    [(("", _), result)] => rev result
wenzelm@26528
   118
  | ((name, _), _) :: _ => err_unbalanced name);
wenzelm@26528
   119
wenzelm@26528
   120
fun parse source =
wenzelm@26528
   121
  (case parse_body source of
wenzelm@27798
   122
    [result] => result
wenzelm@27798
   123
  | [] => XML.Text ""
wenzelm@27798
   124
  | _ => err "multiple results");
wenzelm@26528
   125
wenzelm@26528
   126
end;
wenzelm@26528
   127
wenzelm@26528
   128
end;
wenzelm@26528
   129